X-Git-Url: http://git.uio.no/git/?p=u%2Fmrichter%2FAliRoot.git;a=blobdiff_plain;f=SHUTTLE%2FAliShuttle.cxx;h=3673410d503697fd173b34538489fcaf05ca273a;hp=ba681caf028961983c721084f558d28066dce5f6;hb=a9dab9ea0c0f2b1160c9d2fc6419d8c33c46bdbc;hpb=c19963dbbfc684c51f47ab2eac2f5d56353526ef diff --git a/SHUTTLE/AliShuttle.cxx b/SHUTTLE/AliShuttle.cxx index ba681caf028..3673410d503 100644 --- a/SHUTTLE/AliShuttle.cxx +++ b/SHUTTLE/AliShuttle.cxx @@ -28,6 +28,7 @@ // For detSpec is used the alias name. // +#include #include "AliShuttle.h" #include "AliCDBManager.h" @@ -57,6 +58,7 @@ #include #include #include +#include #include @@ -66,6 +68,7 @@ #include #include +using namespace std; ClassImp(AliShuttle) @@ -77,6 +80,8 @@ fTimeout(timeout), fRetries(retries), fPreprocessorMap(), fLogbookEntry(0), fCurrentDetector(), +fFirstProcessing(0), +fFXSError(-1), fStatusEntry(0), fMonitoringMutex(0), fLastActionTime(0), @@ -93,9 +98,9 @@ fOutputRedirected(kFALSE) // if (!fConfig->IsValid()) AliFatal("********** !!!!! Invalid configuration !!!!! **********"); - for(int iSys=0;iSys<4;iSys++) { + for(int iSys=0;iSys<5;iSys++) { fServer[iSys]=0; - if (iSys < 3) + if (iSys < 4) fFXSlist[iSys].SetOwner(kTRUE); } fPreprocessorMap.SetOwner(kTRUE); @@ -114,7 +119,7 @@ AliShuttle::~AliShuttle() // fPreprocessorMap.DeleteAll(); - for(int iSys=0;iSys<4;iSys++) + for(int iSys=0;iSys<5;iSys++) if(fServer[iSys]) { fServer[iSys]->Close(); delete fServer[iSys]; @@ -197,6 +202,7 @@ Bool_t AliShuttle::StoreLocally(const TString& localUri, // // returns 0 if fail, 1 otherwise + if (fTestMode & kErrorStorage) { Log(fCurrentDetector, "StoreLocally - In TESTMODE - Simulating error while storing locally"); @@ -231,8 +237,11 @@ Bool_t AliShuttle::StoreLocally(const TString& localUri, if (!(AliCDBManager::Instance()->GetStorage(localUri))) { Log("SHUTTLE", Form("StoreLocally - Cannot activate local %s storage", cdbType)); } else { + Int_t logLevel = AliLog::GetGlobalLogLevel(); + AliLog::SetGlobalLogLevel(AliLog::kError); result = AliCDBManager::Instance()->GetStorage(localUri) ->Put(object, id, metaData); + AliLog::SetGlobalLogLevel((AliLog::EType_t)logLevel); } if(!result) { @@ -240,6 +249,7 @@ Bool_t AliShuttle::StoreLocally(const TString& localUri, Log(fCurrentDetector, Form("StoreLocally - Can't store object <%s>!", id.ToString().Data())); } + return result; } @@ -252,6 +262,8 @@ Bool_t AliShuttle::StoreOCDB() // Then calls StoreRefFilesToGrid to store reference files. // + UpdateShuttleStatus(AliShuttleStatus::kStoreStarted); + if (fTestMode & kErrorGrid) { Log("SHUTTLE", "StoreOCDB - In TESTMODE - Simulating error while storing in the Grid"); @@ -260,10 +272,10 @@ Bool_t AliShuttle::StoreOCDB() } Log("SHUTTLE","StoreOCDB - Storing OCDB data ..."); - Bool_t resultCDB = StoreOCDB(fgkMainCDB); + Int_t resultCDB = StoreOCDB(fgkMainCDB); Log("SHUTTLE","StoreOCDB - Storing reference data ..."); - Bool_t resultRef = StoreOCDB(fgkMainRefStorage); + Int_t resultRef = StoreOCDB(fgkMainRefStorage); Log("SHUTTLE","StoreOCDB - Storing reference files ..."); Bool_t resultRefFiles = CopyFilesToGrid("reference"); @@ -271,23 +283,60 @@ Bool_t AliShuttle::StoreOCDB() Bool_t resultMetadata = kTRUE; if(fCurrentDetector == "GRP") { - Log("StoreOCDB - SHUTTLE","Storing Run Metadata file ..."); + Log("SHUTTLE","StoreOCDB - Storing Run Metadata file ..."); resultMetadata = CopyFilesToGrid("metadata"); } - return resultCDB && resultRef && resultRefFiles && resultMetadata; + Int_t storeResult = 0; + + if (resultCDB < 0 || resultRef < 0 || resultRefFiles == kFALSE || resultMetadata == kFALSE) + storeResult = -1; + else if (resultCDB > 0 || resultRef > 0) + storeResult = 1; + + if (storeResult < 0) + { + Log("SHUTTLE", + Form("\t\t\t****** run %d - %s: STORAGE ERROR ******", + GetCurrentRun(), fCurrentDetector.Data())); + UpdateShuttleStatus(AliShuttleStatus::kStoreError); + } + else if (storeResult > 0) + { + Log("SHUTTLE", + Form("\t\t\t****** run %d - %s: STORAGE DELAYED ******", + GetCurrentRun(), fCurrentDetector.Data())); + UpdateShuttleStatus(AliShuttleStatus::kStoreDelayed); + } + else if (storeResult == 0) + { + Log("SHUTTLE", + Form("\t\t\t****** run %d - %s: DONE ******", + GetCurrentRun(), fCurrentDetector.Data())); + UpdateShuttleStatus(AliShuttleStatus::kDone); + UpdateShuttleLogbook(fCurrentDetector, "DONE"); + } + + return (storeResult == 0); } //______________________________________________________________________________________________ -Bool_t AliShuttle::StoreOCDB(const TString& gridURI) +Int_t AliShuttle::StoreOCDB(const TString& gridURI) { // // Called by StoreOCDB(), performs actual storage to the main OCDB and reference storages (Grid) // + // Return code: + // -2 initialization error + // -1 storage error + // 0 success + // 1 storage delayed (e.g. previous unprocessed runs) + // TObjArray* gridIds=0; Bool_t result = kTRUE; + Bool_t delayed = kFALSE; const char* type = 0; TString localURI; @@ -299,7 +348,7 @@ Bool_t AliShuttle::StoreOCDB(const TString& gridURI) localURI = fgkLocalRefStorage; } else { AliError(Form("Invalid storage URI: %s", gridURI.Data())); - return kFALSE; + return -2; } AliCDBManager* man = AliCDBManager::Instance(); @@ -308,8 +357,9 @@ Bool_t AliShuttle::StoreOCDB(const TString& gridURI) if(!gridSto) { Log("SHUTTLE", Form("StoreOCDB - cannot activate main %s storage", type)); - return kFALSE; + return -2; } + gridSto->SetMirrorSEs(fgkMirrorSEs.Data()); gridIds = gridSto->GetQueryCDBList(); @@ -318,7 +368,7 @@ Bool_t AliShuttle::StoreOCDB(const TString& gridURI) if(!localSto) { Log("SHUTTLE", Form("StoreOCDB - cannot activate local %s storage", type)); - return kFALSE; + return -2; } AliCDBPath aPath(GetOfflineDetName(fCurrentDetector.Data()),"*","*"); // Local objects were stored with current run as Grid version! @@ -333,6 +383,8 @@ Bool_t AliShuttle::StoreOCDB(const TString& gridURI) AliCDBId aLocId = aLocEntry->GetId(); aLocEntry->SetVersion(-1); aLocEntry->SetSubVersion(-1); + + Log(fCurrentDetector.Data(), Form("Attempting to store %s", aLocId.ToString().Data())); // If local object is valid up to infinity we store it only if it is // the first unprocessed run! @@ -342,7 +394,10 @@ Bool_t AliShuttle::StoreOCDB(const TString& gridURI) Log("SHUTTLE", Form("StoreOCDB - %s: object %s has validity infinite but " "there are previous unprocessed runs!", fCurrentDetector.Data(), aLocId.GetPath().Data())); - result = kFALSE; + Log(fCurrentDetector.Data(), Form("StoreOCDB - %s: object %s has validity infinite but " + "there are previous unprocessed runs!", + fCurrentDetector.Data(), aLocId.GetPath().Data())); + delayed = kTRUE; continue; } @@ -350,52 +405,66 @@ Bool_t AliShuttle::StoreOCDB(const TString& gridURI) Bool_t store = kTRUE; TIter gridIter(gridIds); AliCDBId* aGridId = 0; - while((aGridId = dynamic_cast (gridIter.Next()))){ - if(aGridId->GetPath() != aLocId.GetPath()) continue; + while ((aGridId = dynamic_cast (gridIter.Next()))) { + if (aGridId->GetPath() != aLocId.GetPath()) + continue; // skip all objects valid up to infinity - if(aGridId->GetLastRun() == AliCDBRunRange::Infinity()) continue; + if (aGridId->GetLastRun() == AliCDBRunRange::Infinity()) + continue; + // if we get here, it means there's already some more recent object stored on Grid! + Log(fCurrentDetector.Data(), + Form("StoreOCDB - A more recent object already exists in %s storage: <%s>", + type, aGridId->ToString().Data())); + store = kFALSE; break; } - // If we get here, the file can be stored! - Bool_t storeOk = gridSto->Put(aLocEntry); - if(!store || storeOk){ - - if (!store) - { - Log(fCurrentDetector.Data(), - Form("StoreOCDB - A more recent object already exists in %s storage: <%s>", - type, aGridId->ToString().Data())); - } else { + Bool_t storeOk = kFALSE; + if (store) + { + Log(fCurrentDetector.Data(), Form("Prechecks succeeded. Ready to store %s", aLocId.ToString().Data())); + storeOk = gridSto->Put(aLocEntry); + if (storeOk) { Log("SHUTTLE", - Form("StoreOCDB - Object <%s> successfully put into %s storage", - aLocId.ToString().Data(), type)); + Form("StoreOCDB - Object <%s> successfully put into %s storage", + aLocId.ToString().Data(), type)); Log(fCurrentDetector.Data(), Form("StoreOCDB - Object <%s> successfully put into %s storage", - aLocId.ToString().Data(), type)); + aLocId.ToString().Data(), type)); + } else { + Log("SHUTTLE", + Form("StoreOCDB - Grid %s storage of object <%s> failed", + type, aLocId.ToString().Data())); + Log(fCurrentDetector.Data(), + Form("StoreOCDB - Grid %s storage of object <%s> failed", + type, aLocId.ToString().Data())); + result = kFALSE; } - - // removing local filename... + } + + if (!store || storeOk) { + // removing local file... TString filename; localSto->IdToFilename(aLocId, filename); Log("SHUTTLE", Form("StoreOCDB - Removing local file %s", filename.Data())); RemoveFile(filename.Data()); - continue; - } else { - Log("SHUTTLE", - Form("StoreOCDB - Grid %s storage of object <%s> failed", - type, aLocId.ToString().Data())); - Log(fCurrentDetector.Data(), - Form("StoreOCDB - Grid %s storage of object <%s> failed", - type, aLocId.ToString().Data())); - result = kFALSE; } } localEntries->Clear(); - return result; + Int_t returnCode = 0; + + if (result == kFALSE) + returnCode = -1; + else if (delayed != kFALSE) + returnCode = 1; + + Log("SHUTTLE", Form("StoreOCDB - Returning with %d (result = %d, delayed = %d)", returnCode, result, delayed)); + Log(fCurrentDetector.Data(), Form("StoreOCDB - Returning with %d (result = %d, delayed = %d)", returnCode, result, delayed)); + + return returnCode; } //______________________________________________________________________________________________ @@ -423,7 +492,7 @@ Bool_t AliShuttle::CleanReferenceStorage(const char* detector) if (!dirList) return kTRUE; - if (dirList->GetEntries() < 3) + if (dirList->GetEntries() < 3) // to be changed to 4? { delete dirList; return kTRUE; @@ -561,8 +630,8 @@ Bool_t AliShuttle::StoreRunMetadataFile(const char* localFile, const char* gridF lhcPeriod.Data())); } - TString target = Form("%s/GRP/RunMetadata/alice/data/%d/%s/%09d/raw/%s", - localBaseFolder.Data(), GetCurrentYear(), + TString target = Form("%s/GRP/RunMetadata%s%d/%s/%09d/raw/%s", + localBaseFolder.Data(), fConfig->GetAlienPath(), GetCurrentYear(), lhcPeriod.Data(), GetCurrentRun(), gridFileName); return CopyFileLocally(localFile, target); @@ -681,10 +750,10 @@ Bool_t AliShuttle::CopyFilesToGrid(const char* type) lhcPeriod.Append(Form("_%s", partition.Data())); } - dir = Form("%s/GRP/RunMetadata/alice/data/%d/%s/%09d/raw", - localBaseFolder.Data(), GetCurrentYear(), + dir = Form("%s/GRP/RunMetadata%s%d/%s/%09d/raw", + localBaseFolder.Data(), fConfig->GetAlienPath(), GetCurrentYear(), lhcPeriod.Data(), GetCurrentRun()); - alienDir = dir(dir.Index("/alice/data/"), dir.Length()); + alienDir = dir(dir.Index(fConfig->GetAlienPath()), dir.Length()); begin = ""; } @@ -804,7 +873,7 @@ const char* AliShuttle::GetRefFilePrefix(const char* base, const char* detector) // TString offDetStr(GetOfflineDetName(detector)); - TString dir; + static TString dir; if (offDetStr == "ITS" || offDetStr == "MUON" || offDetStr == "PHOS") { dir.Form("%s/%s/%s", base, offDetStr.Data(), detector); @@ -813,8 +882,6 @@ const char* AliShuttle::GetRefFilePrefix(const char* base, const char* detector) } return dir.Data(); - - } //______________________________________________________________________________________________ @@ -885,8 +952,13 @@ AliShuttleStatus* AliShuttle::ReadShuttleStatus() fStatusEntry = 0; } - fStatusEntry = AliCDBManager::Instance()->GetStorage(GetLocalCDB()) - ->Get(Form("/SHUTTLE/STATUS/%s", fCurrentDetector.Data()), GetCurrentRun()); + Int_t path1 = GetCurrentRun()/10000; + try{ + fStatusEntry = AliCDBManager::Instance()->GetStorage(GetLocalCDB()) + ->Get(Form("/SHUTTLE/%s/%d", fCurrentDetector.Data(), path1), GetCurrentRun()); + } catch(std::exception& x) { + AliInfo(TString::Format("%s",x.what())); + } if (!fStatusEntry) return 0; fStatusEntry->SetOwner(1); @@ -913,21 +985,28 @@ Bool_t AliShuttle::WriteShuttleStatus(AliShuttleStatus* status) } Int_t run = GetCurrentRun(); + Int_t path1 = run/10000; + TString path1_string = Form("%d",path1); - AliCDBId id(AliCDBPath("SHUTTLE", "STATUS", fCurrentDetector), run, run); + AliCDBId id(AliCDBPath("SHUTTLE", fCurrentDetector, path1_string), run, run); fStatusEntry = new AliCDBEntry(status, id, new AliCDBMetaData); fStatusEntry->SetOwner(1); + Int_t logLevel = AliLog::GetGlobalLogLevel(); + AliLog::SetGlobalLogLevel(AliLog::kError); + UInt_t result = AliCDBManager::Instance()->GetStorage(fgkLocalCDB)->Put(fStatusEntry); - if (!result) { + AliLog::SetGlobalLogLevel((AliLog::EType_t)logLevel); + + if (!result) { Log("SHUTTLE", Form("WriteShuttleStatus - Failed for %s, run %d", fCurrentDetector.Data(), run)); return kFALSE; } - SendMLInfo(); + SendMLDetInfo(); return kTRUE; } @@ -961,13 +1040,18 @@ void AliShuttle::UpdateShuttleStatus(AliShuttleStatus::Status newStatus, Bool_t status->SetStatus(newStatus); if (increaseCount) status->IncreaseCount(); + Int_t logLevel = AliLog::GetGlobalLogLevel(); + AliLog::SetGlobalLogLevel(AliLog::kError); + AliCDBManager::Instance()->GetStorage(fgkLocalCDB)->Put(fStatusEntry); - SendMLInfo(); + AliLog::SetGlobalLogLevel((AliLog::EType_t)logLevel); + + SendMLDetInfo(); } //______________________________________________________________________________________________ -void AliShuttle::SendMLInfo() +void AliShuttle::SendMLDetInfo() { // // sends ML information about the current status of the current detector being processed @@ -976,7 +1060,7 @@ void AliShuttle::SendMLInfo() AliShuttleStatus* status = dynamic_cast (fStatusEntry->GetObject()); if (!status){ - Log("SHUTTLE", "SendMLInfo - UNEXPECTED: status could not be read from current CDB entry"); + Log("SHUTTLE", "SendMLDetInfo - UNEXPECTED: status could not be read from current CDB entry"); return; } @@ -999,20 +1083,21 @@ Bool_t AliShuttle::ContinueProcessing() // checks if the processing should be continued // if yes it returns kTRUE and updates the AliShuttleStatus with nextStatus - if (!fConfig->HostProcessDetector(fCurrentDetector)) return kFALSE; + if (!fConfig->HostProcessDetector(fCurrentDetector)) + return kFALSE; AliPreprocessor* aPreprocessor = dynamic_cast (fPreprocessorMap.GetValue(fCurrentDetector)); if (!aPreprocessor) { - Log("SHUTTLE", Form("ContinueProcessing - %s: no preprocessor registered", fCurrentDetector.Data())); - return kFALSE; + Log("SHUTTLE", Form("ContinueProcessing - %s: no preprocessor registered", fCurrentDetector.Data())); + return kFALSE; } AliShuttleLogbookEntry::Status entryStatus = fLogbookEntry->GetDetectorStatus(fCurrentDetector); - if(entryStatus != AliShuttleLogbookEntry::kUnprocessed) { + if (entryStatus != AliShuttleLogbookEntry::kUnprocessed) { Log("SHUTTLE", Form("ContinueProcessing - %s is %s", fCurrentDetector.Data(), fLogbookEntry->GetDetectorStatusName(entryStatus))); @@ -1028,7 +1113,7 @@ Bool_t AliShuttle::ContinueProcessing() if (fTestMode == kNone) { Log("SHUTTLE", Form("ContinueProcessing - %s requires strict run ordering" - " but this is not the first unprocessed run!")); + " but this is not the first unprocessed run!",fCurrentDetector.Data())); return kFALSE; } else @@ -1036,49 +1121,48 @@ Bool_t AliShuttle::ContinueProcessing() Log("SHUTTLE", Form("ContinueProcessing - In TESTMODE - " "Although %s requires strict run ordering " "and this is not the first unprocessed run, " - "the SHUTTLE continues")); + "the SHUTTLE continues",fCurrentDetector.Data())); } } + // Is the subdetector processed first time for this run? + fFirstProcessing = kFALSE; + AliShuttleStatus* status = ReadShuttleStatus(); if (!status) { // first time Log("SHUTTLE", Form("ContinueProcessing - %s: Processing first time", fCurrentDetector.Data())); status = new AliShuttleStatus(AliShuttleStatus::kStarted); + fFirstProcessing = kTRUE; return WriteShuttleStatus(status); } - // The following two cases shouldn't happen if Shuttle Logbook was correctly updated. + // The following case shouldn't happen if Shuttle Logbook was correctly updated. // If it happens it may mean Logbook updating failed... let's do it now! if (status->GetStatus() == AliShuttleStatus::kDone || - status->GetStatus() == AliShuttleStatus::kFailed){ + status->GetStatus() == AliShuttleStatus::kFailed || + status->GetStatus() == AliShuttleStatus::kSkipped) { Log("SHUTTLE", Form("ContinueProcessing - %s is already %s. Updating Shuttle Logbook", fCurrentDetector.Data(), status->GetStatusName(status->GetStatus()))); - UpdateShuttleLogbook(fCurrentDetector.Data(), - status->GetStatusName(status->GetStatus())); + + if (status->GetStatus() == AliShuttleStatus::kSkipped) + { + UpdateShuttleLogbook(fCurrentDetector.Data(), "DONE"); + } + else + UpdateShuttleLogbook(fCurrentDetector.Data(), status->GetStatusName(status->GetStatus())); + return kFALSE; } - if (status->GetStatus() == AliShuttleStatus::kStoreStarted || status->GetStatus() == AliShuttleStatus::kStoreError) { + if (status->GetStatus() == AliShuttleStatus::kStoreStarted || status->GetStatus() == AliShuttleStatus::kStoreDelayed ||status->GetStatus() == AliShuttleStatus::kStoreError) { Log("SHUTTLE", Form("ContinueProcessing - %s: Grid storage of one or more " "objects failed. Trying again now", fCurrentDetector.Data())); - UpdateShuttleStatus(AliShuttleStatus::kStoreStarted); - if (StoreOCDB()){ - Log("SHUTTLE", Form("ContinueProcessing - %s: all objects " - "successfully stored into main storage", - fCurrentDetector.Data())); - UpdateShuttleStatus(AliShuttleStatus::kDone); - UpdateShuttleLogbook(fCurrentDetector, "DONE"); - } else { - Log("SHUTTLE", - Form("ContinueProcessing - %s: Grid storage failed again", - fCurrentDetector.Data())); - UpdateShuttleStatus(AliShuttleStatus::kStoreError); - } + StoreOCDB(); return kFALSE; } @@ -1103,10 +1187,33 @@ Bool_t AliShuttle::ContinueProcessing() // Send mail to detector expert! Log("SHUTTLE", Form("ContinueProcessing - Sending mail to %s expert...", - fCurrentDetector.Data())); - if (!SendMail()) + fCurrentDetector.Data())); + // det experts in to + TString to=""; + TIter *iterExperts = 0; + iterExperts = new TIter(fConfig->GetResponsibles(fCurrentDetector)); + TObjString *anExpert=0; + while ((anExpert = (TObjString*) iterExperts->Next())) + { + to += Form("%s, \n", anExpert->GetName()); + } + delete iterExperts; + + if (to.Length() > 0) + to.Remove(to.Length()-3); + AliDebug(2, Form("to: %s",to.Data())); + + if (to.IsNull()) { + Log("SHUTTLE", Form("List of %s responsibles not set!", fCurrentDetector.Data())); + return kFALSE; + } + + Log(fCurrentDetector.Data(), Form("ContinueProcessing - Sending mail to %s expert(s):", + fCurrentDetector.Data())); + Log(fCurrentDetector.Data(), Form("\n%s", to.Data())); + if (!SendMail(kPPEMail)) Log("SHUTTLE", Form("ContinueProcessing - Could not send mail to %s expert", - fCurrentDetector.Data())); + fCurrentDetector.Data())); } else { Log("SHUTTLE", Form("ContinueProcessing - %s: restarting. " @@ -1114,7 +1221,9 @@ Bool_t AliShuttle::ContinueProcessing() status->GetStatusName(), status->GetCount())); Bool_t increaseCount = kTRUE; if (status->GetStatus() == AliShuttleStatus::kDCSError || - status->GetStatus() == AliShuttleStatus::kDCSStarted) + status->GetStatus() == AliShuttleStatus::kDCSStarted || + status->GetStatus() == AliShuttleStatus::kFXSError || + status->GetStatus() == AliShuttleStatus::kOCDBError) increaseCount = kFALSE; UpdateShuttleStatus(AliShuttleStatus::kStarted, increaseCount); @@ -1124,6 +1233,71 @@ Bool_t AliShuttle::ContinueProcessing() return cont; } +//______________________________________________________________________________________________ +void AliShuttle::SendMLRunInfo(const char* status) +{ + // + // Send information about this run to ML + + TMonaLisaText mlStatus("SHUTTLE_status", status); + TString runType(fLogbookEntry->GetRunType()); + if (strlen(fLogbookEntry->GetRunParameter("log")) > 0){ + + runType += "("; + runType += fLogbookEntry->GetRunParameter("log"); + runType += ")"; + } + if (fLogbookEntry->GetDATestMode()){ + runType += " (DATest)"; + } + TMonaLisaText mlRunType("SHUTTLE_runtype", runType); + + TList mlList; + mlList.Add(&mlStatus); + mlList.Add(&mlRunType); + + TString mlID; + mlID.Form("%d", GetCurrentRun()); + fMonaLisa->SendParameters(&mlList, mlID); +} + +//______________________________________________________________________________________________ +Int_t AliShuttle::GetMem(Int_t pid) +{ + // invokes ps to get the memory consumption of the process + // returns -1 in case of error + + TString checkStr; + checkStr.Form("ps -o vsize --pid %d | tail -n 1", pid); + FILE* pipe = gSystem->OpenPipe(checkStr, "r"); + if (!pipe) + { + Log("SHUTTLE", Form("Process - Error: " + "Could not open pipe to %s", checkStr.Data())); + return -1; + } + + char buffer[100]; + if (!fgets(buffer, 100, pipe)) + { + Log("SHUTTLE", "Process - Error: ps did not return anything"); + gSystem->ClosePipe(pipe); + return -1; + } + gSystem->ClosePipe(pipe); + + //Log("SHUTTLE", Form("ps returned %s", buffer)); + + Int_t mem = 0; + if ((sscanf(buffer, "%d\n", &mem) != 1) || !mem) + { + Log("SHUTTLE", "Process - Error: Could not parse output of ps"); + return -1; + } + + return mem; +} + //______________________________________________________________________________________________ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) { @@ -1141,17 +1315,10 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) Log("SHUTTLE", Form("\t\t\t^*^*^*^*^*^*^*^*^*^*^*^* run %d: START ^*^*^*^*^*^*^*^*^*^*^*^*", GetCurrentRun())); + CountOpenRuns(); + // Send the information to ML - TMonaLisaText mlStatus("SHUTTLE_status", "Processing"); - TMonaLisaText mlRunType("SHUTTLE_runtype", Form("%s (%s)", entry->GetRunType(), entry->GetRunParameter("log"))); - - TList mlList; - mlList.Add(&mlStatus); - mlList.Add(&mlRunType); - - TString mlID; - mlID.Form("%d", GetCurrentRun()); - fMonaLisa->SendParameters(&mlList, mlID); + SendMLRunInfo("Processing"); if (fLogbookEntry->IsDone()) { @@ -1199,22 +1366,27 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) // Initialization Bool_t hasError = kFALSE; - // Set the CDB and Reference folders according to the year and LHC period - TString lhcPeriod(GetLHCPeriod()); - if (lhcPeriod.Length() == 0) - { - Log("SHUTTLE","Process - LHCPeriod not found in logbook!"); - return 0; - } - - if (fgkMainCDB.Length() == 0) - fgkMainCDB = Form("alien://folder=/alice/data/%d/%s/OCDB?user=alidaq?cacheFold=/tmp/OCDBCache", - GetCurrentYear(), lhcPeriod.Data()); - - if (fgkMainRefStorage.Length() == 0) - fgkMainRefStorage = Form("alien://folder=/alice/data/%d/%s/Reference?user=alidaq?cacheFold=/tmp/OCDBCache", - GetCurrentYear(), lhcPeriod.Data()); - + // Set the CDB and Reference folders according to the year + + // build cdb paths (repeat each time, run might be a DATest run) + if (!fLogbookEntry->GetDATestMode()){ + fgkMainCDB.Form("alien://folder=%s%d/OCDB?user=alidaq?cacheFold=/tmp/OCDBCache", + fConfig->GetAlienPath(), GetCurrentYear()); + + fgkMainRefStorage.Form("alien://folder=%s%d/Reference?user=alidaq?cacheFold=/tmp/OCDBCache", + fConfig->GetAlienPath(), GetCurrentYear()); + } + else { + fgkMainCDB.Form("alien://folder=%s%d/DATest/OCDB?user=alidaq?cacheFold=/tmp/OCDBCache", + fConfig->GetAlienPath(), GetCurrentYear()); + + fgkMainRefStorage.Form("alien://folder=%s%d/DATest/Reference?user=alidaq?cacheFold=/tmp/OCDBCache", + fConfig->GetAlienPath(), GetCurrentYear()); + } + + AliDebug(2,Form("Main CDB storage = %s",fgkMainCDB.Data())); + AliDebug(2,Form("Main Reference storage = %s",fgkMainRefStorage.Data())); + // Loop on detectors in the configuration TIter iter(fConfig->GetDetectors()); TObjString* aDetector = 0; @@ -1225,7 +1397,8 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) { fCurrentDetector = aDetector->String(); - if (ContinueProcessing() == kFALSE) continue; + if (ContinueProcessing() == kFALSE) + continue; if (first) { @@ -1240,7 +1413,10 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) Log("SHUTTLE", Form("\t\t\t****** run %d - %s: START ******", GetCurrentRun(), aDetector->GetName())); - for(Int_t iSys=0;iSys<3;iSys++) fFXSCalled[iSys]=kFALSE; + for(Int_t iSys=0;iSys<4;iSys++) fFXSCalled[iSys]=kFALSE; + + Int_t initialMem = GetMem(getpid()); + Log("SHUTTLE", Form("Memory consumption before forking is %d", initialMem)); Log(fCurrentDetector.Data(), "Process - Starting processing"); @@ -1263,18 +1439,56 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) { Long_t expiredTime = time(0) - begin; - if (expiredTime > fConfig->GetPPTimeOut()) + // the run-dependent timeout is the timeout from the configuration plus a twentieth of + // the run duration, e.g. 3 additional minutes for 1h run or 1/2h for a 10h run + Int_t runDepTimeOut = fConfig->GetPPTimeOut() + (GetCurrentEndTime() - GetCurrentStartTime()) * 0.05; + if (expiredTime > runDepTimeOut) { - TString tmp; - tmp.Form("Process - Process of %s time out. " - "Run time: %d seconds. Killing...", - fCurrentDetector.Data(), expiredTime); - Log("SHUTTLE", tmp); - Log(fCurrentDetector, tmp); + TString logMsg; + AliShuttleStatus *currentStatus = ReadShuttleStatus(); + AliShuttleStatus::Status newStatus = AliShuttleStatus::kInvalid; + + if (currentStatus->GetStatus() == AliShuttleStatus::kDCSStarted) + { + // in case the pp goes in TimeOut while retrieving the DCS DPs + // set status to kDCSError + + logMsg.Form("Process - Process of %s timed out while retrieving the DCS DataPoints. Run time: %ld seconds. Killing... and setting status to DCSError.", + fCurrentDetector.Data(), expiredTime); + newStatus = AliShuttleStatus::kDCSError; + } + else if (currentStatus->GetStatus() <= AliShuttleStatus::kPPDone) + { + // in case pp not yet done set status to kPPTimeOut + + logMsg.Form("Process - Process of %s timed out. Run time: %ld seconds. Killing...", + fCurrentDetector.Data(), expiredTime); + newStatus = AliShuttleStatus::kPPTimeOut; + } + else if (currentStatus->GetStatus() == AliShuttleStatus::kStoreStarted) + { + // in case the pp goes in TimeOut while storing the objects in the OCDB + // set status to kStoreError + + logMsg.Form("Process - Process of %s timed out while storing the OCDB object. Run time: %ld seconds. Killing... and setting status to StoreError.", + fCurrentDetector.Data(), expiredTime); + newStatus = AliShuttleStatus::kStoreError; + } + else + { + // in other cases don't change the status + + logMsg.Form("Process - Process of %s timed out in status = %s. Run time: %ld seconds. Killing... without changing the status", + fCurrentDetector.Data(), currentStatus->GetStatusName(), expiredTime); + } + + Log("SHUTTLE", logMsg); + Log(fCurrentDetector, logMsg); kill(pid, 9); - UpdateShuttleStatus(AliShuttleStatus::kPPTimeOut); + if (newStatus != AliShuttleStatus::kInvalid) + UpdateShuttleStatus(newStatus); hasError = kTRUE; gSystem->Sleep(1000); @@ -1283,38 +1497,19 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) { gSystem->Sleep(1000); - TString checkStr; - checkStr.Form("ps -o vsize --pid %d | tail -n 1", pid); - FILE* pipe = gSystem->OpenPipe(checkStr, "r"); - if (!pipe) - { - Log("SHUTTLE", Form("Process - Error: " - "Could not open pipe to %s", checkStr.Data())); + Int_t mem = GetMem(pid); + + if (mem < 0) continue; - } - char buffer[100]; - if (!fgets(buffer, 100, pipe)) - { - Log("SHUTTLE", "Process - Error: ps did not return anything"); - gSystem->ClosePipe(pipe); - continue; - } - gSystem->ClosePipe(pipe); - - //Log("SHUTTLE", Form("ps returned %s", buffer)); - - Int_t mem = 0; - if ((sscanf(buffer, "%d\n", &mem) != 1) || !mem) - { - Log("SHUTTLE", "Process - Error: Could not parse output of ps"); - continue; - } + mem -= initialMem; + if (mem < 0) + mem = 0; if (expiredTime % 60 == 0) { Log("SHUTTLE", Form("Process - %s: Checking process. " - "Run time: %d seconds - Memory consumption: %d KB", + "Run time: %ld seconds - Memory consumption: %d KB", fCurrentDetector.Data(), expiredTime, mem)); SendAlive(); } @@ -1353,8 +1548,8 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) } else if (pid == 0) { - // client - Log("SHUTTLE", Form("Process - In client process of %d - %s", GetCurrentRun(), + // child + Log("SHUTTLE", Form("Process - In child process of %d - %s", GetCurrentRun(), aDetector->GetName())); Log("SHUTTLE", Form("Process - Redirecting output to %s log",fCurrentDetector.Data())); @@ -1370,10 +1565,14 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) Log("SHUTTLE", "Process - Could not redirect stderr"); } + + Log("SHUTTLE", "Executing TGrid::Connect"); + TGrid::Connect("alien://"); TString wd = gSystem->WorkingDirectory(); - TString tmpDir = Form("%s/%s_%d_process", GetShuttleTempDir(), - fCurrentDetector.Data(), GetCurrentRun()); + Int_t dir_lev1 = GetCurrentRun()/10000; + TString tmpDir = Form("%s/%d/%d/%s_process", GetShuttleTempDir(), + dir_lev1, GetCurrentRun(), fCurrentDetector.Data()); Int_t result = gSystem->GetPathInfo(tmpDir.Data(), 0, (Long64_t*) 0, 0, 0); if (!result) // temp dir already exists! @@ -1395,11 +1594,11 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) gSystem->Exit(1); } - Bool_t success = ProcessCurrentDetector(); - + Int_t success = ProcessCurrentDetector(); + gSystem->ChangeDirectory(wd.Data()); - if (success) // Preprocessor finished successfully! + if (success == 1) // Preprocessor finished successfully! { // remove temporary folder or DCS map if (!fConfig->KeepTempFolder()) @@ -1416,29 +1615,17 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) fCurrentDetector.Data())); // Transfer the data from local storage to main storage (Grid) - UpdateShuttleStatus(AliShuttleStatus::kStoreStarted); if (StoreOCDB() == kFALSE) - { - Log("SHUTTLE", - Form("\t\t\t****** run %d - %s: STORAGE ERROR ******", - GetCurrentRun(), aDetector->GetName())); - UpdateShuttleStatus(AliShuttleStatus::kStoreError); success = kFALSE; - } else { - Log("SHUTTLE", - Form("\t\t\t****** run %d - %s: DONE ******", - GetCurrentRun(), aDetector->GetName())); - UpdateShuttleStatus(AliShuttleStatus::kDone); - UpdateShuttleLogbook(fCurrentDetector, "DONE"); - } - } else + } + else if (success == 0) { Log("SHUTTLE", - Form("\t\t\t****** run %d - %s: PP ERROR ******", + Form("\t\t\t****** run %d - %s: ERROR ******", GetCurrentRun(), aDetector->GetName())); } - for (UInt_t iSys=0; iSys<3; iSys++) + for (UInt_t iSys=0; iSys<4; iSys++) { if (fFXSCalled[iSys]) fFXSlist[iSys].Clear(); } @@ -1488,6 +1675,7 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) fFirstUnprocessed[iDet] = kFALSE; } } + SendMLRunInfo("Pending"); } } @@ -1497,7 +1685,7 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) } //______________________________________________________________________________________________ -Bool_t AliShuttle::ProcessCurrentDetector() +Int_t AliShuttle::ProcessCurrentDetector() { // // Makes data retrieval just for a specific detector (fCurrentDetector). @@ -1509,16 +1697,39 @@ Bool_t AliShuttle::ProcessCurrentDetector() TString wd = gSystem->WorkingDirectory(); if (!CleanReferenceStorage(fCurrentDetector.Data())) - return kFALSE; + return 0; gSystem->ChangeDirectory(wd.Data()); - TMap* dcsMap = new TMap(); - // call preprocessor AliPreprocessor* aPreprocessor = dynamic_cast (fPreprocessorMap.GetValue(fCurrentDetector)); + // check if the preprocessor wants to process this run type + if (aPreprocessor->ProcessRunType() == kFALSE) + { + UpdateShuttleStatus(AliShuttleStatus::kSkipped); + UpdateShuttleLogbook(fCurrentDetector, "DONE"); + if (!UpdateTableSkippedCase(fCurrentDetector.Data())) + { + AliError(Form("Could not update FXS tables for run %d !", GetCurrentRun())); + } + Log(fCurrentDetector, Form("ProcessCurrentDetector - %s preprocessor is not interested in this run type", fCurrentDetector.Data())); + + return 2; + } + + // checking if OCDB is reachable + AliCDBEntry* testEntry = GetFromOCDB("SHUTTLE","GRP/CTP/DummyConfig"); + if (!testEntry){ + // OCDB is not accessible, going in OCDBError for current detector + AliError("OCDB Test entry not accessible"); + UpdateShuttleStatus(AliShuttleStatus::kOCDBError); + return 0; + } + + TMap* dcsMap = new TMap(); + aPreprocessor->Initialize(GetCurrentRun(), GetCurrentStartTime(), GetCurrentEndTime()); Bool_t processDCS = aPreprocessor->ProcessDCS(); @@ -1538,7 +1749,7 @@ Bool_t AliShuttle::ProcessCurrentDetector() UpdateShuttleStatus(AliShuttleStatus::kDCSStarted); UpdateShuttleStatus(AliShuttleStatus::kDCSError); delete dcsMap; - return kFALSE; + return 0; } else { UpdateShuttleStatus(AliShuttleStatus::kDCSStarted); @@ -1559,9 +1770,10 @@ Bool_t AliShuttle::ProcessCurrentDetector() TMap* aliasMap = 0; TMap* dpMap = 0; - + if (fConfig->GetDCSAliases(fCurrentDetector, iServ)->GetEntries() > 0) { + Log(fCurrentDetector, Form("Querying %d DCS aliases", fConfig->GetDCSAliases(fCurrentDetector, iServ)->GetEntries())); aliasMap = GetValueSet(host, port, fConfig->GetDCSAliases(fCurrentDetector, iServ), kAlias, multiSplit); @@ -1573,17 +1785,18 @@ Bool_t AliShuttle::ProcessCurrentDetector() " Sending mail to DCS experts!", host.Data())); UpdateShuttleStatus(AliShuttleStatus::kDCSError); - if (!SendMailToDCS()) + if (!SendMail(kDCSEMail)) Log("SHUTTLE", Form("ProcessCurrentDetector - " - "Could not send mail to DCS experts!")); + "Could not send mail to DCS experts!")); delete dcsMap; - return kFALSE; + return 0; } } if (fConfig->GetDCSDataPoints(fCurrentDetector, iServ)->GetEntries() > 0) { + Log(fCurrentDetector, Form("Querying %d DCS data points", fConfig->GetDCSDataPoints(fCurrentDetector, iServ)->GetEntries())); dpMap = GetValueSet(host, port, fConfig->GetDCSDataPoints(fCurrentDetector, iServ), kDP, multiSplit); @@ -1595,13 +1808,13 @@ Bool_t AliShuttle::ProcessCurrentDetector() " Sending mail to DCS experts!", host.Data())); UpdateShuttleStatus(AliShuttleStatus::kDCSError); - if (!SendMailToDCS()) + if (!SendMail(kDCSEMail)) Log("SHUTTLE", Form("ProcessCurrentDetector - " - "Could not send mail to DCS experts!")); + "Could not send mail to DCS experts!")); if (aliasMap) delete aliasMap; delete dcsMap; - return kFALSE; + return 0; } } @@ -1638,7 +1851,17 @@ Bool_t AliShuttle::ProcessCurrentDetector() // DCS Archive DB processing successful. Call Preprocessor! UpdateShuttleStatus(AliShuttleStatus::kPPStarted); + fFXSError = -1; // this variable is kTRUE after ::Process if an FXS error occured + UInt_t returnValue = aPreprocessor->Process(dcsMap); + + if (fFXSError!=-1) { + UpdateShuttleStatus(AliShuttleStatus::kFXSError); + SendMail(kFXSEMail, fFXSError); + dcsMap->DeleteAll(); + delete dcsMap; + return 0; + } if (returnValue > 0) // Preprocessor error! { @@ -1647,7 +1870,7 @@ Bool_t AliShuttle::ProcessCurrentDetector() UpdateShuttleStatus(AliShuttleStatus::kPPError); dcsMap->DeleteAll(); delete dcsMap; - return kFALSE; + return 0; } // preprocessor ok! @@ -1658,7 +1881,7 @@ Bool_t AliShuttle::ProcessCurrentDetector() dcsMap->DeleteAll(); delete dcsMap; - return kTRUE; + return 1; } //______________________________________________________________________________________________ @@ -1666,14 +1889,16 @@ void AliShuttle::CountOpenRuns() { // Query DAQ's Shuttle logbook and sends the number of open runs to ML + SendAlive(); + // check connection, in case connect - if (!Connect(3)) + if (!Connect(4)) return; TString sqlQuery; sqlQuery = Form("select count(*) from %s where shuttle_done=0", fConfig->GetShuttlelbTable()); - TSQLResult* aResult = fServer[3]->Query(sqlQuery); + TSQLResult* aResult = fServer[4]->Query(sqlQuery); if (!aResult) { AliError(Form("Can't execute query <%s>!", sqlQuery.Data())); return; @@ -1724,12 +1949,12 @@ Bool_t AliShuttle::QueryShuttleLogbook(const char* whereClause, entries.SetOwner(1); // check connection, in case connect - if (!Connect(3)) return kFALSE; + if (!Connect(4)) return kFALSE; TString sqlQuery; sqlQuery = Form("select * from %s %s order by run", fConfig->GetShuttlelbTable(), whereClause); - TSQLResult* aResult = fServer[3]->Query(sqlQuery); + TSQLResult* aResult = fServer[4]->Query(sqlQuery); if (!aResult) { AliError(Form("Can't execute query <%s>!", sqlQuery.Data())); return kFALSE; @@ -1744,7 +1969,7 @@ Bool_t AliShuttle::QueryShuttleLogbook(const char* whereClause, } // TODO Check field count! - const UInt_t nCols = 23; + const UInt_t nCols = 26; if (aResult->GetFieldCount() != (Int_t) nCols) { Log("SHUTTLE", "Invalid SQL result field number!"); delete aResult; @@ -1760,6 +1985,11 @@ Bool_t AliShuttle::QueryShuttleLogbook(const char* whereClause, if (!entry) continue; + // DA test mode flag + TString daTestModeString(aRow->GetField(2), aRow->GetFieldLength(2)); // field 2 = DA test mode flag + Bool_t daTestMode = (Bool_t)daTestModeString.Atoi(); + entry->SetDATestMode(daTestMode); + // loop on detectors for(UInt_t ii = 0; ii < nCols; ii++) entry->SetDetectorStatus(aResult->GetFieldName(ii), aRow->GetField(ii)); @@ -1780,13 +2010,13 @@ AliShuttleLogbookEntry* AliShuttle::QueryRunParameters(Int_t run) // // check connection, in case connect - if (!Connect(3)) + if (!Connect(4)) return 0; TString sqlQuery; sqlQuery.Form("select * from %s where run=%d", fConfig->GetDAQlbTable(), run); - TSQLResult* aResult = fServer[3]->Query(sqlQuery); + TSQLResult* aResult = fServer[4]->Query(sqlQuery); if (!aResult) { Log("SHUTTLE", Form("Can't execute query <%s>!", sqlQuery.Data())); return 0; @@ -1818,119 +2048,83 @@ AliShuttleLogbookEntry* AliShuttle::QueryRunParameters(Int_t run) for (Int_t ii = 0; ii < aResult->GetFieldCount(); ii++) entry->SetRunParameter(aResult->GetFieldName(ii), aRow->GetField(ii)); + delete aRow; + delete aResult; + UInt_t startTime = entry->GetStartTime(); UInt_t endTime = entry->GetEndTime(); + Bool_t ecsSuccess = entry->GetECSSuccess(); + TString runType = entry->GetRunType(); + TString tmpdaqstartTime = entry->GetRunParameter("DAQ_time_start"); + TString recordingFlagString = entry->GetRunParameter("GDCmStreamRecording"); + UInt_t recordingFlag = recordingFlagString.Atoi(); + UInt_t daqstartTime = tmpdaqstartTime.Atoi(); + + UInt_t now = time(0); + Int_t dcsDelay = fConfig->GetDCSDelay()+fConfig->GetDCSQueryOffset(); -// if (!startTime || !endTime || startTime > endTime) -// { -// Log("SHUTTLE", -// Form("QueryRunParameters - Invalid parameters for Run %d: startTime = %d, endTime = %d. Skipping!", -// run, startTime, endTime)); -// -// Log("SHUTTLE", Form("Marking SHUTTLE done for run %d", run)); -// fLogbookEntry = entry; -// if (!UpdateShuttleLogbook("shuttle_done")) -// { -// AliError(Form("Could not update logbook for run %d !", run)); -// } -// fLogbookEntry = 0; -// -// delete entry; -// delete aRow; -// delete aResult; -// return 0; -// } - - if (!startTime) - { - Log("SHUTTLE", - Form("QueryRunParameters - Invalid parameters for Run %d: " - "startTime = %d, endTime = %d. Skipping!", - run, startTime, endTime)); - - Log("SHUTTLE", Form("Marking SHUTTLE done for run %d", run)); - fLogbookEntry = entry; - if (!UpdateShuttleLogbook("shuttle_ignored")) - { - AliError(Form("Could not update logbook for run %d !", run)); - } - fLogbookEntry = 0; + Bool_t skip = kFALSE; - delete entry; - delete aRow; - delete aResult; - return 0; + // runs are processed if + // a) runType is PHYSICS and ecsSuccess is set + // b) runType is not PHYSICS and (ecsSuccess is set or DAQ_time_start is non-0) + // effectively this means that all runs are processed that started properly (ecsSucess behaviour is different for PHYSICS and non-PHYSICS runs (check with ECS!) + if (startTime != 0 && endTime != 0) { + if (endTime > startTime) { + if (endTime >= now - dcsDelay) { + Log("SHUTTLE", Form("Skipping run %d for now, because DCS buffer time is not yet expired", run)); + } else { + if ((runType == "PHYSICS" || runType == "STANDALONE") && recordingFlag == 0){ + Log("SHUTTLE", Form("QueryRunParameters - Run type for run %d is %s but the recording is OFF - Skipping!", run, runType.Data())); + skip = kTRUE; + } + else { + if (runType == "PHYSICS") { + if (ecsSuccess) { + return entry; + } else { + Log("SHUTTLE", Form("QueryRunParameters - Run type for run %d is PHYSICS but ECS success flag not set (Reason = %s) - Skipping!", run, entry->GetRunParameter("eor_reason"))); + skip = kTRUE; + } + } else { + if (ecsSuccess || daqstartTime > 0) { + if (ecsSuccess == kFALSE) + Log("SHUTTLE", Form("Processing run %d although in status ECS failure (Reason: %s), since run type != PHYSICS and DAQ_time_start != 0", run, entry->GetRunParameter("eor_reason"))); + return entry; + } else { + Log("SHUTTLE", Form("QueryRunParameters - Run type for run %d is %s, ECS success flag was not set (Reason = %s) and DAQ_time_start was NULL - Skipping!", run, runType.Data(), entry->GetRunParameter("eor_reason"))); + skip = kTRUE; + } + } + } + } + } else { + Log("SHUTTLE", Form("QueryRunParameters - Invalid parameters for run %d: startTime equal to endTime: %d %d - Skipping!", run, startTime, endTime)); + skip = kTRUE; + } + } else { + Log("SHUTTLE", Form("QueryRunParameters - Invalid parameters for Run %d: " + "startTime = %d, endTime = %d. Skipping (Shuttle won't be marked as DONE)!", + run, startTime, endTime)); } - if (startTime && !endTime) - { - // TODO Here we don't mark SHUTTLE done, because this may mean - //the run is still ongoing!! - Log("SHUTTLE", - Form("QueryRunParameters - Invalid parameters for Run %d: " - "startTime = %d, endTime = %d. Skipping (Shuttle won't be marked as DONE)!", - run, startTime, endTime)); - - //Log("SHUTTLE", Form("Marking SHUTTLE done for run %d", run)); - //fLogbookEntry = entry; - //if (!UpdateShuttleLogbook("shuttle_done")) - //{ - // AliError(Form("Could not update logbook for run %d !", run)); - //} - //fLogbookEntry = 0; - - delete entry; - delete aRow; - delete aResult; - return 0; - } - - if (startTime && endTime && (startTime > endTime)) + if (skip) { - Log("SHUTTLE", - Form("QueryRunParameters - Invalid parameters for Run %d: " - "startTime = %d, endTime = %d. Skipping!", - run, startTime, endTime)); - - Log("SHUTTLE", Form("Marking SHUTTLE done for run %d", run)); - fLogbookEntry = entry; - if (!UpdateShuttleLogbook("shuttle_ignored")) + Log("SHUTTLE", Form("Marking SHUTTLE skipped for run %d", run)); + fLogbookEntry = entry; + if (!UpdateShuttleLogbook("shuttle_skipped")) { AliError(Form("Could not update logbook for run %d !", run)); } - fLogbookEntry = 0; - - delete entry; - delete aRow; - delete aResult; - return 0; - } - - TString totEventsStr = entry->GetRunParameter("totalEvents"); - Int_t totEvents = totEventsStr.Atoi(); - if (totEvents < 1) - { - Log("SHUTTLE", - Form("QueryRunParameters - Run %d has 0 events - Skipping!", run)); - - Log("SHUTTLE", Form("Marking SHUTTLE done for run %d", run)); - fLogbookEntry = entry; - if (!UpdateShuttleLogbook("shuttle_ignored")) + if (!UpdateTableSkippedCase("ALL")) { - AliError(Form("Could not update logbook for run %d !", run)); + AliError(Form("Could not update FXS tables for run %d !", run)); } fLogbookEntry = 0; - - delete entry; - delete aRow; - delete aResult; - return 0; } - - delete aRow; - delete aResult; - - return entry; + + delete entry; + return 0; } //______________________________________________________________________________________________ @@ -1948,13 +2142,17 @@ TMap* AliShuttle::GetValueSet(const char* host, Int_t port, const TSeqCollection TMap* result = 0; if (type == kAlias) { - result = client.GetAliasValues(entries, GetCurrentStartTime(), - GetCurrentEndTime()); + //result = client.GetAliasValues(entries, GetCurrentStartTime()-offset, + // GetCurrentEndTime()+offset); + result = client.GetAliasValues(entries, GetStartTimeDCSQuery(), + GetEndTimeDCSQuery()); } else if (type == kDP) { - result = client.GetDPValues(entries, GetCurrentStartTime(), - GetCurrentEndTime()); + //result = client.GetDPValues(entries, GetCurrentStartTime()-offset, + // GetCurrentEndTime()+offset); + result = client.GetDPValues(entries, GetStartTimeDCSQuery(), + GetEndTimeDCSQuery()); } if (result == 0) @@ -1994,6 +2192,7 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, if (!Connect(system)) { Log(detector, Form("GetFile - Couldn't connect to %s FXS database", GetSystemName(system))); + fFXSError = system; return 0; } @@ -2005,7 +2204,7 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, TString whereClause = Form("run=%d and detector=\"%s\" and fileId=\"%s\"", GetCurrentRun(), detector, id); - if (system == kDAQ) + if (system == kDAQ || system == kDQM) { whereClause += Form(" and DAQsource=\"%s\"", source); } @@ -2016,7 +2215,6 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, else if (system == kHLT) { whereClause += Form(" and DDLnumbers=\"%s\"", source); - nFields = 3; } TString sqlQuery = Form("%s %s", sqlQueryStart.Data(), whereClause.Data()); @@ -2027,15 +2225,16 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, TSQLResult* aResult = 0; aResult = dynamic_cast (fServer[system]->Query(sqlQuery)); if (!aResult) { - Log(detector, Form("GetFileName - Can't execute SQL query to %s database for: id = %s, source = %s", + Log(detector, Form("GetFile - Can't execute SQL query to %s database for: id = %s, source = %s", GetSystemName(system), id, sourceName.Data())); + fFXSError = system; return 0; } - if(aResult->GetRowCount() == 0) + if (aResult->GetRowCount() == 0) { Log(detector, - Form("GetFileName - No entry in %s FXS db for: id = %s, source = %s", + Form("GetFile - No entry in %s FXS db for: id = %s, source = %s", GetSystemName(system), id, sourceName.Data())); delete aResult; return 0; @@ -2043,8 +2242,9 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, if (aResult->GetRowCount() > 1) { Log(detector, - Form("GetFileName - More than one entry in %s FXS db for: id = %s, source = %s", + Form("GetFile - More than one entry in %s FXS db for: id = %s, source = %s", GetSystemName(system), id, sourceName.Data())); + fFXSError = system; delete aResult; return 0; } @@ -2053,6 +2253,7 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, Log(detector, Form("GetFileName - Wrong field count in %s FXS db for: id = %s, source = %s", GetSystemName(system), id, sourceName.Data())); + fFXSError = system; delete aResult; return 0; } @@ -2060,8 +2261,9 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, TSQLRow* aRow = dynamic_cast (aResult->Next()); if (!aRow){ - Log(detector, Form("GetFileName - Empty set result in %s FXS db from query: id = %s, source = %s", + Log(detector, Form("GetFile - Empty set result in %s FXS db from query: id = %s, source = %s", GetSystemName(system), id, sourceName.Data())); + fFXSError = system; delete aResult; return 0; } @@ -2077,10 +2279,12 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, filePath.Data(), fileSize.Data(), fileChecksum.Data())); // retrieved file is renamed to make it unique - TString localFileName = Form("%s/%s_%d_process/%s_%s_%d_%s_%s.shuttle", - GetShuttleTempDir(), detector, GetCurrentRun(), + Int_t dir_lev1 = GetCurrentRun()/10000; + TString localFileName = Form("%s/%d/%d/%s_process/%s_%s_%d_%s_%s.shuttle", + GetShuttleTempDir(), dir_lev1, GetCurrentRun(), detector, GetSystemName(system), detector, GetCurrentRun(), id, sourceName.Data()); + Log("SHUTTLE",Form("file from FXS = %s",localFileName.Data())); // file retrieval from FXS @@ -2089,12 +2293,12 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, Bool_t result = kFALSE; // copy!! if successful TSystem::Exec returns 0 - while(nRetries++ < maxRetries) { + while (nRetries++ < maxRetries) { AliDebug(2, Form("Trying to copy file. Retry # %d", nRetries)); result = RetrieveFile(system, filePath.Data(), localFileName.Data()); - if(!result) + if (!result) { - Log(detector, Form("GetFileName - Copy of file %s from %s FXS failed", + Log(detector, Form("GetFile - Copy of file %s from %s FXS failed", filePath.Data(), GetSystemName(system))); continue; } @@ -2102,13 +2306,12 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, if (fileSize.Length()>0) { // compare filesize of local file with the one stored in the FXS DB - TString command=("stat --format=%s"); - Int_t sizeComp = gSystem->Exec(Form("%s %s |grep %s 2>&1 > /dev/null", - command.Data(), localFileName.Data(),fileSize.Data())); + Long_t size = -1; + Int_t sizeComp = gSystem->GetPathInfo(localFileName.Data(), 0, &size, 0, 0); - if ( sizeComp != 0) + if (sizeComp != 0 || size != fileSize.Atoi()) { - Log(detector, Form("GetFileName - size of file %s does not match with local copy!", + Log(detector, Form("GetFile - size of file %s does not match with local copy!", filePath.Data())); result = kFALSE; continue; @@ -2122,12 +2325,13 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, if (fileChecksum.Length()>0) { // compare md5sum of local file with the one stored in the FXS DB - Int_t md5Comp = gSystem->Exec(Form("md5sum %s |grep %s 2>&1 > /dev/null", + if(fileChecksum.Contains(' ')) fileChecksum.Resize(fileChecksum.First(' ')); + Int_t md5Comp = gSystem->Exec(Form("md5sum %s |grep %s > /dev/null 2> /dev/null", localFileName.Data(), fileChecksum.Data())); if (md5Comp != 0) { - Log(detector, Form("GetFileName - md5sum of file %s does not match with local copy!", + Log(detector, Form("GetFile - md5sum of file %s does not match with local copy!", filePath.Data())); result = kFALSE; continue; @@ -2139,7 +2343,11 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, if (result) break; } - if(!result) return 0; + if (!result) + { + fFXSError = system; + return 0; + } fFXSCalled[system]=kTRUE; TObjString *fileParams = new TObjString(Form("%s#!?!#%s", id, sourceName.Data())); @@ -2180,26 +2388,11 @@ Bool_t AliShuttle::RetrieveFile(UInt_t system, const char* fxsFileName, const ch } } - TString baseFXSFolder; - if (system == kDAQ) - { - baseFXSFolder = "FES/"; - } - else if (system == kDCS) - { - baseFXSFolder = ""; - } - else if (system == kHLT) - { - baseFXSFolder = "/opt/FXS/"; - } - - - TString command = Form("scp -oPort=%d -2 %s@%s:%s%s %s", + TString command = Form("scp -oPort=%d -2 %s@%s:%s/%s %s", fConfig->GetFXSPort(system), fConfig->GetFXSUser(system), fConfig->GetFXSHost(system), - baseFXSFolder.Data(), + fConfig->GetFXSBaseFolder(system), fxsFileName, localFileName); @@ -2245,11 +2438,12 @@ TList* AliShuttle::GetFileSources(Int_t system, const char* detector, const char if (!Connect(system)) { Log(detector, Form("GetFileSources - Couldn't connect to %s FXS database", GetSystemName(system))); + fFXSError = system; return NULL; } - TString sourceName = 0; - if (system == kDAQ) + TString sourceName = ""; + if (system == kDAQ || system == kDQM) { sourceName = "DAQsource"; } else if (system == kHLT) @@ -2272,6 +2466,7 @@ TList* AliShuttle::GetFileSources(Int_t system, const char* detector, const char if (!aResult) { Log(detector, Form("GetFileSources - Can't execute SQL query to %s database for id: %s", GetSystemName(system), id)); + fFXSError = system; return 0; } @@ -2326,7 +2521,7 @@ TList* AliShuttle::GetFileIDs(Int_t system, const char* detector, const char* so return NULL; } - TString sourceName = 0; + TString sourceName = ""; if (system == kDAQ) { sourceName = "DAQsource"; @@ -2390,11 +2585,24 @@ Bool_t AliShuttle::Connect(Int_t system) // // check connection: if already connected return - if(fServer[system] && fServer[system]->IsConnected()) return kTRUE; + + if(fServer[system] && fServer[system]->IsConnected()) { + // ping the server + if (fServer[system]->PingVerify()==kTRUE){ // connection is still alive + return kTRUE; + } + else{ + AliWarning(Form("Connection got lost to FXS database for %s. Closing and reconnecting.", + AliShuttleInterface::GetSystemName(system))); + fServer[system]->Close(); + delete fServer[system]; + fServer[system] = 0x0; + } + } TString dbHost, dbUser, dbPass, dbName; - if (system < 3) // FXS db servers + if (system < 4) // FXS db servers { dbHost = Form("mysql://%s:%d", fConfig->GetFXSdbHost(system), fConfig->GetFXSdbPort(system)); dbUser = fConfig->GetFXSdbUser(system); @@ -2409,8 +2617,8 @@ Bool_t AliShuttle::Connect(Int_t system) } fServer[system] = TSQLServer::Connect(dbHost.Data(), dbUser.Data(), dbPass.Data()); - if (!fServer[system] || !fServer[system]->IsConnected()) { - if(system < 3) + if (!fServer[system] || !fServer[system]->IsConnected()) { + if(system < 4) { AliError(Form("Can't establish connection to FXS database for %s", AliShuttleInterface::GetSystemName(system))); @@ -2433,8 +2641,11 @@ Bool_t AliShuttle::Connect(Int_t system) case kHLT: aResult = fServer[kHLT]->GetTables(dbName.Data()); break; + case kDQM: + aResult = fServer[kDQM]->GetTables(dbName.Data()); + break; default: - aResult = fServer[3]->GetTables(dbName.Data()); + aResult = fServer[4]->GetTables(dbName.Data()); break; } @@ -2451,7 +2662,7 @@ Bool_t AliShuttle::UpdateTable() Bool_t result = kTRUE; - for (UInt_t system=0; system<3; system++) + for (UInt_t system=0; system<4; system++) { if(!fFXSCalled[system]) continue; @@ -2484,7 +2695,7 @@ Bool_t AliShuttle::UpdateTable() const char* source = ((TObjString*) aFXSarray->At(1))->GetName(); TString whereClause; - if (system == kDAQ) + if (system == kDAQ || system == kDQM) { whereClause = Form("where run=%d and detector=\"%s\" and fileId=\"%s\" and DAQsource=\"%s\";", GetCurrentRun(), fCurrentDetector.Data(), fileId, source); @@ -2502,8 +2713,8 @@ Bool_t AliShuttle::UpdateTable() delete aFXSarray; - TString sqlQuery = Form("update %s set time_processed=%d %s", fConfig->GetFXSdbTable(system), - now.GetSec(), whereClause.Data()); + TString sqlQuery = Form("update %s set time_processed=%ld %s", fConfig->GetFXSdbTable(system), + (ULong_t)now.GetSec(), whereClause.Data()); AliDebug(2, Form("SQL query: \n%s",sqlQuery.Data())); @@ -2524,6 +2735,61 @@ Bool_t AliShuttle::UpdateTable() return result; } +//_______________________________________________________________________________ +Bool_t AliShuttle::UpdateTableSkippedCase(const char* detector) +{ + // + // Update FXS table filling time_processed field in all rows corresponding to current run and detector + // if detector = "ALL" update all detectors + // + + Bool_t result = kTRUE; + + TString detName(detector); + + for (UInt_t system=0; system<4; system++) + { + + // check connection, in case connect + if (!Connect(system)) + { + Log(fCurrentDetector, Form("UpdateTableSkippedCase - Couldn't connect to %s FXS database", GetSystemName(system))); + result = kFALSE; + continue; + } + + TTimeStamp now; // now + + // Loop on FXS list entries + TIter iter(&fFXSlist[system]); + + TString whereClause; + if (detName == "ALL") whereClause = Form("where run=%d and time_processed IS NULL;",GetCurrentRun()); + else whereClause = Form("where run=%d and detector=\"%s\" and time_processed IS NULL;",GetCurrentRun(), detector); + + //Log("SHUTTLE",Form(" whereClause = %s ",whereClause.Data())); + + TString sqlQuery = Form("update %s set time_processed=%ld %s", fConfig->GetFXSdbTable(system), + (ULong_t)now.GetSec(), whereClause.Data()); + + AliDebug(2, Form("SQL query: \n%s",sqlQuery.Data())); + + // Query execution + TSQLResult* aResult; + aResult = dynamic_cast (fServer[system]->Query(sqlQuery)); + if (!aResult) + { + Log("SHUTTLE", Form("UpdateTableSkippedCase - %s db: can't execute SQL query <%s>", + GetSystemName(system), sqlQuery.Data())); + result = kFALSE; + continue; + } + delete aResult; + + } + + return result; +} //______________________________________________________________________________________________ Bool_t AliShuttle::UpdateTableFailCase() { @@ -2533,7 +2799,7 @@ Bool_t AliShuttle::UpdateTableFailCase() Bool_t result = kTRUE; - for (UInt_t system=0; system<3; system++) + for (UInt_t system=0; system<4; system++) { // check connection, in case connect if (!Connect(system)) @@ -2552,8 +2818,8 @@ Bool_t AliShuttle::UpdateTableFailCase() GetCurrentRun(), fCurrentDetector.Data()); - TString sqlQuery = Form("update %s set time_processed=%d %s", fConfig->GetFXSdbTable(system), - now.GetSec(), whereClause.Data()); + TString sqlQuery = Form("update %s set time_processed=%ld %s", fConfig->GetFXSdbTable(system), + (ULong_t)now.GetSec(), whereClause.Data()); AliDebug(2, Form("SQL query: \n%s",sqlQuery.Data())); @@ -2582,30 +2848,31 @@ Bool_t AliShuttle::UpdateShuttleLogbook(const char* detector, const char* status // // check connection, in case connect - if(!Connect(3)){ + if(!Connect(4)){ Log("SHUTTLE", "UpdateShuttleLogbook - Couldn't connect to DAQ Logbook."); return kFALSE; } TString detName(detector); TString setClause; - if (detName == "shuttle_done" || detName == "shuttle_ignored") + if (detName == "shuttle_done" || detName == "shuttle_skipped") { setClause = "set shuttle_done=1"; - + if (detName == "shuttle_done") { - // Send the information to ML - TMonaLisaText mlStatus("SHUTTLE_status", "Done"); - - TList mlList; - mlList.Add(&mlStatus); - - TString mlID; - mlID.Form("%d", GetCurrentRun()); - fMonaLisa->SendParameters(&mlList, mlID); + if (TouchFile() != kTRUE) + { + SendMLRunInfo("Pending"); + return kFALSE; + } + + SendMLRunInfo("Done"); } - } else { + else + SendMLRunInfo("Skipped"); + } + else { TString statusStr(status); if(statusStr.Contains("done", TString::kIgnoreCase) || statusStr.Contains("failed", TString::kIgnoreCase)){ @@ -2627,7 +2894,7 @@ Bool_t AliShuttle::UpdateShuttleLogbook(const char* detector, const char* status // Query execution TSQLResult* aResult; - aResult = dynamic_cast (fServer[3]->Query(sqlQuery)); + aResult = dynamic_cast (fServer[4]->Query(sqlQuery)); if (!aResult) { Log("SHUTTLE", Form("UpdateShuttleLogbook - Can't execute query <%s>", sqlQuery.Data())); return kFALSE; @@ -2666,7 +2933,6 @@ UInt_t AliShuttle::GetCurrentEndTime() const return fLogbookEntry ? fLogbookEntry->GetEndTime() : 0; } - //______________________________________________________________________________________________ UInt_t AliShuttle::GetCurrentYear() const { @@ -2696,16 +2962,17 @@ const char* AliShuttle::GetLHCPeriod() const } //______________________________________________________________________________________________ -void AliShuttle::Log(const char* detector, const char* message) +void AliShuttle::Log(const char* detector, const char* message, UInt_t level) { // // Fill log string with a message // - - TString logRunDir = GetShuttleLogDir(); - if (GetCurrentRun() >=0) - logRunDir += Form("/%d", GetCurrentRun()); + TString logRunDir = GetShuttleLogDir(); + if (GetCurrentRun() >=0) { + Int_t logDir_lev1 = GetCurrentRun()/10000; + logRunDir += Form("/%d/%d", logDir_lev1, GetCurrentRun()); + } void* dir = gSystem->OpenDirectory(logRunDir.Data()); if (dir == NULL) { if (gSystem->mkdir(logRunDir.Data(), kTRUE)) { @@ -2717,12 +2984,12 @@ void AliShuttle::Log(const char* detector, const char* message) gSystem->FreeDirectory(dir); } - TString toLog = Form("%s (%d): %s - ", TTimeStamp(time(0)).AsString("s"), getpid(), detector); + TString toLog = Form("%s UTC (%d): %s - ", TTimeStamp(time(0)).AsString("s"), getpid(), detector); if (GetCurrentRun() >= 0) toLog += Form("run %d - ", GetCurrentRun()); toLog += Form("%s", message); - AliInfo(toLog.Data()); + AliLog::Message(level, toLog, MODULENAME(), ClassName(), FUNCTIONNAME(), __FILE__, __LINE__); // if we redirect the log output already to the file, leave here if (fOutputRedirected && strcmp(detector, "SHUTTLE") != 0) @@ -2756,8 +3023,9 @@ TString AliShuttle::GetLogFileName(const char* detector) const if (GetCurrentRun() >= 0) { - fileName.Form("%s/%d/%s_%d.log", GetShuttleLogDir(), GetCurrentRun(), - detector, GetCurrentRun()); + Int_t logDir_lev1 = GetCurrentRun()/10000; + fileName.Form("%s/%d/%d/%s.log", GetShuttleLogDir(), logDir_lev1, GetCurrentRun(), + detector); } else { fileName.Form("%s/%s.log", GetShuttleLogDir(), detector); } @@ -2799,7 +3067,6 @@ Bool_t AliShuttle::Collect(Int_t run) if (!fMonaLisa) fMonaLisa = new TMonaLisaWriter(fConfig->GetMonitorHost(), fConfig->GetMonitorTable()); - SendAlive(); CountOpenRuns(); TString whereClause("where shuttle_done=0"); @@ -2830,9 +3097,9 @@ Bool_t AliShuttle::Collect(Int_t run) { // query Shuttle logbook for earlier runs, check if some detectors are unprocessed, // flag them into fFirstUnprocessed array - TString whereClause(Form("where shuttle_done=0 and run < %d", run)); + TString whereClauseBis(Form("where shuttle_done=0 and run < %d", run)); TObjArray tmpLogbookEntries; - if (!QueryShuttleLogbook(whereClause, tmpLogbookEntries)) + if (!QueryShuttleLogbook(whereClauseBis, tmpLogbookEntries)) { Log("SHUTTLE", "Collect - Can't retrieve entries from Shuttle logbook"); return kFALSE; @@ -2983,7 +3250,7 @@ AliCDBEntry* AliShuttle::GetFromOCDB(const char* detector, const AliCDBPath& pat } //______________________________________________________________________________________________ -Bool_t AliShuttle::SendMail() +Bool_t AliShuttle::SendMail(EMailTarget target, Int_t system) { // // sends a mail to the subdetector expert in case of preprocessor error @@ -2992,37 +3259,80 @@ Bool_t AliShuttle::SendMail() if (fTestMode != kNone) return kTRUE; - if (!fConfig->SendMail()) return kTRUE; + if (!fConfig->SendMail()) + return kTRUE; + + if (target == kDCSEMail || target == kFXSEMail) { + if (!fFirstProcessing) + return kTRUE; + } + + Int_t runMode = (Int_t)fConfig->GetRunMode(); + TString tmpStr; + if (runMode == 0) tmpStr = " Nightly Test:"; + else tmpStr = " Data Taking:"; + void* dir = gSystem->OpenDirectory(GetShuttleLogDir()); + if (dir == NULL) + { + if (gSystem->mkdir(GetShuttleLogDir(), kTRUE)) + { + Log("SHUTTLE", Form("SendMail - Can't open directory <%s>", GetShuttleLogDir())); + return kFALSE; + } + } else { + gSystem->FreeDirectory(dir); + } + + // det experts in to TString to=""; - TIter iterExperts(fConfig->GetResponsibles(fCurrentDetector)); + TIter *iterExperts = 0; + if (target == kDCSEMail) { + iterExperts = new TIter(fConfig->GetAdmins(AliShuttleConfig::kAmanda)); + } + else if (target == kFXSEMail) { + iterExperts = new TIter(fConfig->GetAdmins(system)); + } + if (iterExperts) { + TObjString *anExpert=0; + while ((anExpert = (TObjString*) iterExperts->Next())) + { + to += Form("%s,", anExpert->GetName()); + } + delete iterExperts; + } + + // add subdetector experts + iterExperts = new TIter(fConfig->GetResponsibles(fCurrentDetector)); TObjString *anExpert=0; - while ((anExpert = (TObjString*) iterExperts.Next())) + while ((anExpert = (TObjString*) iterExperts->Next())) { to += Form("%s,", anExpert->GetName()); } + delete iterExperts; + if (to.Length() > 0) to.Remove(to.Length()-1); AliDebug(2, Form("to: %s",to.Data())); if (to.IsNull()) { - Log("SHUTTLE", "List of detector responsibles not set!"); + Log("SHUTTLE", Form("List of %d responsibles not set!", (Int_t) target)); return kFALSE; } - void* dir = gSystem->OpenDirectory(GetShuttleLogDir()); - if (dir == NULL) + // SHUTTLE responsibles in cc + TString cc=""; + TIter iterAdmins(fConfig->GetAdmins(AliShuttleConfig::kGlobal)); + TObjString *anAdmin=0; + while ((anAdmin = (TObjString*) iterAdmins.Next())) { - if (gSystem->mkdir(GetShuttleLogDir(), kTRUE)) - { - Log("SHUTTLE", Form("SendMail - Can't open directory <%s>", GetShuttleLogDir())); - return kFALSE; - } - - } else { - gSystem->FreeDirectory(dir); + cc += Form("%s,", anAdmin->GetName()); } + if (cc.Length() > 0) + cc.Remove(cc.Length()-1); + AliDebug(2, Form("cc: %s",to.Data())); + // mail body TString bodyFileName; bodyFileName.Form("%s/mail.body", GetShuttleLogDir()); gSystem->ExpandPathName(bodyFileName); @@ -3036,32 +3346,52 @@ Bool_t AliShuttle::SendMail() return kFALSE; } - TString cc=""; - TIter iterAdmins(fConfig->GetAdmins(AliShuttleConfig::kGlobal)); - TObjString *anAdmin=0; - while ((anAdmin = (TObjString*) iterAdmins.Next())) - { - cc += Form("%s,", anAdmin->GetName()); - } - if (cc.Length() > 0) - cc.Remove(cc.Length()-1); - AliDebug(2, Form("cc: %s",to.Data())); - TString subject = Form("%s Shuttle preprocessor FAILED in run %d (run type = %s)!", - fCurrentDetector.Data(), GetCurrentRun(), GetRunType()); - AliDebug(2, Form("subject: %s", subject.Data())); + TString subject; + TString body; + + if (target == kDCSEMail){ + subject = Form("%s CRITICAL Retrieval of data points for %s FAILED in run %d !", + tmpStr.Data(), fCurrentDetector.Data(), GetCurrentRun()); + AliDebug(2, Form("subject: %s", subject.Data())); + + body = Form("Dear DCS experts, \n\n"); + body += Form("SHUTTLE couldn\'t retrieve the data points for detector %s " + "in run %d!!\n\n", fCurrentDetector.Data(), GetCurrentRun()); + } + else if (target == kFXSEMail){ + subject = Form("%s CRITICAL FXS communication for %s FAILED in run %d !", + tmpStr.Data(), fCurrentDetector.Data(), GetCurrentRun()); + AliDebug(2, Form("subject: %s", subject.Data())); + TString sys; + if (system == kDAQ) sys="DAQ"; + else if (system == kDCS) sys="DCS"; + else if (system == kHLT) sys="HLT"; + else if (system == kDQM) sys="DQM"; + else return kFALSE; + body = Form("Dear %s FXS experts, \n\n",sys.Data()); + body += Form("SHUTTLE couldn\'t retrieve data from the FXS for detector %s " + "in run %d!!\n\n", fCurrentDetector.Data(), GetCurrentRun()); + body += Form("The contacted server was:\nDB: %s\nFXS:%s\n\n", fConfig->GetFXSdbHost(system), fConfig->GetFXSHost(system)); + } + else { + subject = Form("%s %s Shuttle preprocessor FAILED in run %d (run type = %s)!", + tmpStr.Data(), fCurrentDetector.Data(), GetCurrentRun(), GetRunType()); + AliDebug(2, Form("subject: %s", subject.Data())); + + body = Form("Dear %s expert(s), \n\n", fCurrentDetector.Data()); + body += Form("SHUTTLE just detected that your preprocessor " + "failed processing run %d (run type = %s)!!\n\n", + GetCurrentRun(), GetRunType()); + } - TString body = Form("Dear %s expert(s), \n\n", fCurrentDetector.Data()); - body += Form("SHUTTLE just detected that your preprocessor " - "failed processing run %d (run type = %s)!!\n\n", - GetCurrentRun(), GetRunType()); body += Form("Please check %s status on the SHUTTLE monitoring page: \n\n", fCurrentDetector.Data()); if (fConfig->GetRunMode() == AliShuttleConfig::kTest) { - body += Form("\thttp://pcalimonitor.cern.ch:8889/shuttle.jsp?time=168 \n\n"); + body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?time=24 \n\n"); } else { - body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?instance=PROD&time=168 \n\n"); + body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?instance=PROD&time=24 \n\n"); } @@ -3071,10 +3401,10 @@ Bool_t AliShuttle::SendMail() body += Form("Find the %s log for the current run on \n\n" - "\thttp://pcalishuttle01.cern.ch:8880/%s/%d/%s_%d.log \n\n", - fCurrentDetector.Data(), logFolder.Data(), GetCurrentRun(), - fCurrentDetector.Data(), GetCurrentRun()); - body += Form("The last 10 lines of %s log file are following:\n\n", fCurrentDetector.Data()); + "\thttp://pcalishuttle02.cern.ch/%s/%d/%d/%s.log \n\n", + fCurrentDetector.Data(), logFolder.Data(), GetCurrentRun()/10000, + GetCurrentRun(), fCurrentDetector.Data()); + body += Form("The last 15 lines of %s log file are following:\n\n", fCurrentDetector.Data()); AliDebug(2, Form("Body begin: %s", body.Data())); @@ -3082,9 +3412,9 @@ Bool_t AliShuttle::SendMail() mailBody.close(); mailBody.open(bodyFileName, ofstream::out | ofstream::app); - TString logFileName = Form("%s/%d/%s_%d.log", GetShuttleLogDir(), - GetCurrentRun(), fCurrentDetector.Data(), GetCurrentRun()); - TString tailCommand = Form("tail -n 10 %s >> %s", logFileName.Data(), bodyFileName.Data()); + TString logFileName = Form("%s/%d/%d/%s.log", GetShuttleLogDir(), + GetCurrentRun()/10000, GetCurrentRun(), fCurrentDetector.Data()); + TString tailCommand = Form("tail -n 15 %s >> %s", logFileName.Data(), bodyFileName.Data()); if (gSystem->Exec(tailCommand.Data())) { mailBody << Form("%s log file not found ...\n\n", fCurrentDetector.Data()); @@ -3113,168 +3443,187 @@ Bool_t AliShuttle::SendMail() return result == 0; } - //______________________________________________________________________________________________ -Bool_t AliShuttle::SendMailToDCS() +const char* AliShuttle::GetRunType() { // - // sends a mail to the DCS Amanda experts in case of DCS data point retrieval error + // returns run type read from "run type" logbook // - - if (fTestMode != kNone) - return kTRUE; - - if (!fConfig->SendMail()) return kTRUE; - void* dir = gSystem->OpenDirectory(GetShuttleLogDir()); - if (dir == NULL) - { - if (gSystem->mkdir(GetShuttleLogDir(), kTRUE)) - { - Log("SHUTTLE", Form("SendMailToDCS - Can't open directory <%s>", GetShuttleLogDir())); - return kFALSE; - } - - } else { - gSystem->FreeDirectory(dir); + if(!fLogbookEntry) { + AliError("No logbook entry!"); + return 0; } - TString bodyFileName; - bodyFileName.Form("%s/mail.body", GetShuttleLogDir()); - gSystem->ExpandPathName(bodyFileName); - - ofstream mailBody; - mailBody.open(bodyFileName, ofstream::out); + return fLogbookEntry->GetRunType(); +} - if (!mailBody.is_open()) - { - Log("SHUTTLE", Form("SendMailToDCS - Could not open mail body file %s", bodyFileName.Data())); - return kFALSE; - } +//______________________________________________________________________________________________ +Bool_t AliShuttle::GetHLTStatus() +{ + // Return HLT status (ON=1 OFF=0) + // Converts the HLT status from the mode string read in the run logbook (not just a bool) - TString to=""; - TIter iterExperts(fConfig->GetAdmins(AliShuttleConfig::kAmanda)); - TObjString *anExpert=0; - while ((anExpert = (TObjString*) iterExperts.Next())) - { - to += Form("%s,", anExpert->GetName()); + if(!fLogbookEntry) { + AliError("No logbook entry!"); + return 0; } - if (to.Length() > 0) - to.Remove(to.Length()-1); - AliDebug(2, Form("to: %s",to.Data())); - if (to.IsNull()) { - Log("SHUTTLE", "List of Amanda server administrators not set!"); + // TODO implement when HLTMode is inserted in run logbook + TString hltMode = fLogbookEntry->GetRunParameter("HLTmode"); + TSubString firstChar = hltMode(0,1); + AliDebug(2,Form("First char = %s ",firstChar.Data())); + if (firstChar == "A") { return kFALSE; } - - TString cc=""; - TIter iterAdmins(fConfig->GetAdmins(AliShuttleConfig::kGlobal)); - TObjString *anAdmin=0; - while ((anAdmin = (TObjString*) iterAdmins.Next())) - { - cc += Form("%s,", anAdmin->GetName()); + else if ((firstChar == "B") || (firstChar == "C") || (firstChar == "D") || (firstChar == "E")) { + return kTRUE; } - if (cc.Length() > 0) - cc.Remove(cc.Length()-1); - AliDebug(2, Form("cc: %s",to.Data())); + else { + Log("SHUTTLE","Unexpected HLT mode! Returning 0...."); + return kFALSE; + } +} - TString subject = Form("Retrieval of data points for %s FAILED in run %d !", - fCurrentDetector.Data(), GetCurrentRun()); - AliDebug(2, Form("subject: %s", subject.Data())); +//______________________________________________________________________________________________ +const char* AliShuttle::GetTriggerConfiguration() +{ + // Receives the trigger configuration from the DAQ logbook for the current run + + // check connection, if needed reconnect + if (!Connect(4)) + return 0; - TString body = Form("Dear DCS experts, \n\n"); - body += Form("SHUTTLE couldn\'t retrieve the data points for detector %s " - "in run %d!!\n\n", fCurrentDetector.Data(), GetCurrentRun()); - body += Form("Please check %s status on the SHUTTLE monitoring page: \n\n", - fCurrentDetector.Data()); - if (fConfig->GetRunMode() == AliShuttleConfig::kTest) + TString sqlQuery; + sqlQuery.Form("SELECT configFile FROM logbook_trigger_config WHERE run = %d", GetCurrentRun()); + TSQLResult* result = fServer[4]->Query(sqlQuery); + if (!result) { - body += Form("\thttp://pcalimonitor.cern.ch:8889/shuttle.jsp?time=168 \n\n"); - } else { - body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?instance=PROD?time=168 \n\n"); + Log("SHUTTLE", Form("ERROR: Can't execute query <%s>!", sqlQuery.Data())); + return 0; } - - TString logFolder = "logs"; - if (fConfig->GetRunMode() == AliShuttleConfig::kProd) - logFolder += "_PROD"; + if (result->GetRowCount() == 0) + { + Log("SHUTTLE", "WARNING: Trigger configuration not found in logbook_trigger_config"); + delete result; + return 0; + } - body += Form("Find the %s log for the current run on \n\n" - "\thttp://pcalishuttle01.cern.ch:8880/%s/%d/%s_%d.log \n\n", - fCurrentDetector.Data(), logFolder.Data(), GetCurrentRun(), - fCurrentDetector.Data(), GetCurrentRun()); - body += Form("The last 10 lines of %s log file are following:\n\n", fCurrentDetector.Data()); - - AliDebug(2, Form("Body begin: %s", body.Data())); - - mailBody << body.Data(); - mailBody.close(); - mailBody.open(bodyFileName, ofstream::out | ofstream::app); - - TString logFileName = Form("%s/%d/%s_%d.log", GetShuttleLogDir(), GetCurrentRun(), - fCurrentDetector.Data(), GetCurrentRun()); - TString tailCommand = Form("tail -n 10 %s >> %s", logFileName.Data(), bodyFileName.Data()); - if (gSystem->Exec(tailCommand.Data())) + TSQLRow* row = result->Next(); + if (!row) { - mailBody << Form("%s log file not found ...\n\n", fCurrentDetector.Data()); + Log("SHUTTLE", "ERROR: Could not receive logbook_trigger_config data"); + delete result; + return 0; } - TString endBody = Form("------------------------------------------------------\n\n"); - endBody += Form("In case of problems please contact the SHUTTLE core team.\n\n"); - endBody += "Please do not answer this message directly, it is automatically generated.\n\n"; - endBody += "Greetings,\n\n \t\t\tthe SHUTTLE\n"; - - AliDebug(2, Form("Body end: %s", endBody.Data())); - - mailBody << endBody.Data(); - - mailBody.close(); - - // send mail! - TString mailCommand = Form("mail -s \"%s\" -c %s %s < %s", - subject.Data(), - cc.Data(), - to.Data(), - bodyFileName.Data()); - AliDebug(2, Form("mail command: %s", mailCommand.Data())); - - Bool_t result = gSystem->Exec(mailCommand.Data()); - - return result == 0; + // static, so that pointer remains valid when it is returned to the calling class + static TString triggerConfig(row->GetField(0)); + + delete row; + row = 0; + + delete result; + result = 0; + + Log("SHUTTLE", Form("Found trigger configuration: %s", triggerConfig.Data())); + + return triggerConfig; } //______________________________________________________________________________________________ -const char* AliShuttle::GetRunType() +const char* AliShuttle::GetCTPTimeParams() { - // - // returns run type read from "run type" logbook - // + // Receives the CTP time parameters from the DAQ logbook for the current run + + // check connection, if needed reconnect + if (!Connect(4)) + return 0; - if(!fLogbookEntry) { - AliError("No logbook entry!"); + TString sqlQuery; + sqlQuery.Form("SELECT alignmentFile FROM logbook_trigger_config WHERE run = %d", GetCurrentRun()); + TSQLResult* result = fServer[4]->Query(sqlQuery); + if (!result) + { + Log("SHUTTLE", Form("ERROR: Can't execute query <%s>!", sqlQuery.Data())); + return 0; + } + + if (result->GetRowCount() == 0) + { + Log("SHUTTLE", "WARNING: CTP time params not found in logbook_trigger_config"); + delete result; + return 0; + } + + TSQLRow* row = result->Next(); + if (!row) + { + Log("SHUTTLE", "ERROR: Could not receive logbook_trigger_config data"); + delete result; return 0; } - return fLogbookEntry->GetRunType(); + // static, so that pointer remains valid when it is returned to the calling class + static TString triggerTimeParams(row->GetField(0)); + + delete row; + row = 0; + + delete result; + result = 0; + + Log("SHUTTLE", Form("Found trigger time parameters: %s", triggerTimeParams.Data())); + + return triggerTimeParams; } //______________________________________________________________________________________________ -Bool_t AliShuttle::GetHLTStatus() +const char* AliShuttle::GetTriggerDetectorMask() { - // Return HLT status (ON=1 OFF=0) - // Converts the HLT status from the status string read in the run logbook (not just a bool) + // Receives the trigger detector mask from DAQ logbook + + // check connection, if needed reconnect + if (!Connect(4)) + return 0; - if(!fLogbookEntry) { - AliError("No logbook entry!"); + TString sqlQuery; + sqlQuery.Form("SELECT BIN(BIT_OR(inputDetectorMask)) from logbook_trigger_clusters WHERE run = %d;", GetCurrentRun()); + TSQLResult* result = fServer[4]->Query(sqlQuery); + if (!result) + { + Log("SHUTTLE", Form("ERROR: Can't execute query <%s>!", sqlQuery.Data())); + return 0; + } + + if (result->GetRowCount() == 0) + { + Log("SHUTTLE", "ERROR: Trigger Detector Mask not found in logbook_trigger_clusters"); + delete result; + return 0; + } + + TSQLRow* row = result->Next(); + if (!row) + { + Log("SHUTTLE", "ERROR: Could not receive logbook_trigger_clusters data"); + delete result; return 0; } - // TODO implement when HLTStatus is inserted in run logbook - //TString hltStatus = fLogbookEntry->GetRunParameter("HLTStatus"); - //if(hltStatus == "OFF") {return kFALSE}; - - return kTRUE; + // static, so that pointer remains valid when it is returned to the calling class + static TString triggerDetectorMask(row->GetField(0)); + + delete row; + row = 0; + + delete result; + result = 0; + + Log("SHUTTLE", Form("Found Trigger Detector Mask: %s", triggerDetectorMask.Data())); + + return triggerDetectorMask; } //______________________________________________________________________________________________ @@ -3296,3 +3645,168 @@ void AliShuttle::SetShuttleLogDir(const char* logDir) fgkShuttleLogDir = gSystem->ExpandPathName(logDir); } +//______________________________________________________________________________________________ +Bool_t AliShuttle::TouchFile() +{ + // + // touching a file on the grid if run has been DONE + // + + if (!gGrid) + { + Log("SHUTTLE",Form("No TGrid connection estabilished!")); + Log("SHUTTLE",Form("Could not touch file for run %i",GetCurrentRun())); + return kFALSE; + } + + TString dir; + dir.Form("%s%d/SHUTTLE_DONE", fConfig->GetAlienPath(), GetCurrentYear()); + // checking whether directory for touch command exists + TString commandLs; + commandLs.Form("ls %s",dir.Data()); + TGridResult *resultLs = dynamic_cast(gGrid->Command(commandLs)); + if (!resultLs){ + Log("SHUTTLE",Form("No result for %s command, returning without touching",commandLs.Data())); + return kFALSE; + } + TMap *mapLs = dynamic_cast(resultLs->At(0)); + if (!mapLs){ + Log("SHUTTLE",Form("No map for %s command, returning without touching",commandLs.Data())); + delete resultLs; + resultLs = 0x0; + return kFALSE; + } + TObjString *valueLsPath = dynamic_cast(mapLs->GetValue("path")); + if (!valueLsPath || (valueLsPath->GetString()).CompareTo(dir)!=1){ + Log("SHUTTLE",Form("No directory %s found, creating it",dir.Data())); + + // creating the directory + + Bool_t boolMkdir = gGrid->Mkdir(dir.Data()); + if (!boolMkdir) { + Log("SHUTTLE",Form("Impossible to create dir %s in alien catalogue for run %i!",dir.Data(),GetCurrentRun())); + delete resultLs; + resultLs = 0x0; + return kFALSE; + } + Log("SHUTTLE",Form("Directory %s successfully created in alien catalogue for run %i",dir.Data(),GetCurrentRun())); + } + else { + Log("SHUTTLE",Form("Directory %s correctly found for run %i",dir.Data(),GetCurrentRun())); + } + + delete resultLs; + resultLs = 0x0; + + TString command; + command.Form("touch %s/%i", dir.Data(), GetCurrentRun()); + Log("SHUTTLE", Form("Creating entry in file catalog: %s", command.Data())); + TGridResult *resultTouch = dynamic_cast(gGrid->Command(command)); + if (!resultTouch){ + Log("SHUTTLE",Form("No result for touching command, returning without touching for run %i",GetCurrentRun())); + return kFALSE; + } + TMap *mapTouch = dynamic_cast(resultTouch->At(0)); + if (!mapTouch){ + Log("SHUTTLE",Form("No map for touching command, returning without touching for run %i",GetCurrentRun())); + delete resultTouch; + resultTouch = 0x0; + return kFALSE; + } + TObjString *valueTouch = dynamic_cast(mapTouch->GetValue("__result__")); + if (!valueTouch){ + Log("SHUTTLE",Form("No value for \"__result__\" key set in the map for touching command, returning without touching for run %i",GetCurrentRun())); + delete resultTouch; + resultTouch = 0x0; + return kFALSE; + } + if (valueTouch->GetString()!="1"){ + Log("SHUTTLE",Form("Failing the touching command, returning without touching for run %i",GetCurrentRun())); + delete resultTouch; + resultTouch = 0x0; + return kFALSE; + } + delete resultTouch; + resultTouch = 0x0; + Log("SHUTTLE", "Sucessfully touched the file"); + return kTRUE; +} +//______________________________________________________________________________________________ +UInt_t AliShuttle::GetStartTimeDCSQuery() +{ + // Return Start Time for the DCS query + // + // The call is delegated to AliShuttleInterface + + return GetCurrentStartTime()-fConfig->GetDCSQueryOffset(); +} +//______________________________________________________________________________________________ +UInt_t AliShuttle::GetEndTimeDCSQuery() +{ + // Return End Time for the DCS query + // + // The call is delegated to AliShuttleInterface + + return GetCurrentEndTime()+fConfig->GetDCSQueryOffset(); +} +//______________________________________________________________________________________________ +void AliShuttle::SendMLFromDet(const char* value) +{ + // + // Sending an information coming from the current detector to ML + // + + TMonaLisaText mlText(Form("%s_RunCondition", fCurrentDetector.Data()), value); + + TList mlList; + mlList.Add(&mlText); + + TString mlID; + mlID.Form("%d", GetCurrentRun()); + fMonaLisa->SendParameters(&mlList, mlID); + + return; +} +//______________________________________________________________________________________________ +TString* AliShuttle::GetLTUConfig(const char* det) +{ + // + // Getting ltuFineDelay1, ltuFineDelay2, ltuBCDelay for detector det from logbook_detectors table in logbook + // + + if (!Connect(4)) + return 0; + + TString sqlQuery; + sqlQuery.Form("select LTUFineDelay1, LTUFineDelay2, LTUBCDelayAdd from logbook_detectors WHERE run_number = %d and detector = \"%s\";", GetCurrentRun(),det); + + TSQLResult* result = fServer[4]->Query(sqlQuery); + if (!result){ + Log("SHUTTLE","ERROR: No result found for the LTU configuration query"); + return 0x0; + } + if (result->GetRowCount() == 0){ + Log("SHUTTLE",Form("ERROR: LTU configuration not found in logbook_detectors for detector %s, returning null pointer",det)); + delete result; + return 0x0; + } + if (result->GetFieldCount() != 3){ + Log("SHUTTLE",Form("ERROR: not all the required fields are there for the LTU configuration for detector %s (only %d found), returning a null pointer",det, result->GetFieldCount())); + delete result; + return 0x0; + } + TSQLRow* row = result->Next(); + if (!row){ + Printf("ERROR: Could not receive logbook_detectors data, returning null pointer"); + delete result; + return 0x0; + } + TString* ltuConfigString = new TString[3]; + + ltuConfigString[0] = row->GetField(0); + ltuConfigString[1] = row->GetField(1); + ltuConfigString[2] = row->GetField(2); + + return ltuConfigString; + +}