Logo ROOT   6.30.04
Reference Guide
 All Namespaces Files Pages
TCondor.cxx
Go to the documentation of this file.
1 // @(#)root/proof:$Id$
2 // Author: Maarten Ballintijn 06/12/03
3 
4 /*************************************************************************
5  * Copyright (C) 1995-2001, Rene Brun and Fons Rademakers. *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 //////////////////////////////////////////////////////////////////////////
13 // //
14 // TCondor //
15 // //
16 // Interface to the Condor system. TCondor provides a (partial) API for //
17 // querying and controlling the Condor system, including experimental //
18 // extensions like COD (computing on demand) //
19 // //
20 //////////////////////////////////////////////////////////////////////////
21 
22 #include <stdlib.h>
23 
24 #include "TCondor.h"
25 #include "TList.h"
26 #include "TSystem.h"
27 #include "TObjString.h"
28 #include "TRegexp.h"
29 #include "TProofDebug.h"
30 #include "Riostream.h"
31 #include "TEnv.h"
32 #include "TClass.h"
33 
// ROOT ClassImp macro invocations for the two classes implemented in this file.
34 ClassImp(TCondorSlave);
35 ClassImp(TCondor);
36 
37 
38 ////////////////////////////////////////////////////////////////////////////////
39 /// Create Condor interface object. Uses Condor apps since there is no
40 /// API yet.
41 
42 TCondor::TCondor(const char *pool) : fPool(pool), fState(kFree)
43 {
44  fClaims = new TList;
45 
46  // Setup Condor
47 
48  TString condorHome = gEnv->GetValue("Proof.CondorHome", (char*)0);
49  if (condorHome != "") {
50  TString path = gSystem->Getenv("PATH");
51  path = condorHome + "/bin:" + path;
52  gSystem->Setenv("PATH",path);
53  }
54 
55  TString condorConf = gEnv->GetValue("Proof.CondorConfig", (char*)0);
56  if (condorConf != "") {
57  gSystem->Setenv("CONDOR_CONFIG",condorConf);
58  }
59 
60  char *loc = gSystem->Which(gSystem->Getenv("PATH"), "condor_cod",
61  kExecutePermission);
62 
63  if (loc) {
64  fValid = kTRUE;
65  delete [] loc;
66  } else {
67  fValid = kFALSE;
68  }
69 }
70 
71 
72 ////////////////////////////////////////////////////////////////////////////////
73 /// Cleanup Condor interface.
74 
75 TCondor::~TCondor()
76 {
77  PDB(kCondor,1) Info("~TCondor","fState %d", fState );
78 
79  if (fState != kFree) {
80  Release();
81  }
82  delete fClaims;
83 }
84 
85 
86 ////////////////////////////////////////////////////////////////////////////////
87 /// Print master status
88 
89 void TCondor::Print(Option_t * opt) const
90 {
91  std::cout << "OBJ: " << IsA()->GetName()
92  << "\tPool: \"" << fPool << "\""
93  << "\tState: " << fState << std::endl;
94  fClaims->Print(opt);
95 }
96 
97 
98 ////////////////////////////////////////////////////////////////////////////////
99 /// Claim a VirtualMachine for PROOF usage.
100 
101 TCondorSlave *TCondor::ClaimVM(const char *vm, const char *cmd)
102 {
103  Int_t port = 0;
104 
105  TString claimCmd = Form("condor_cod request -name %s -timeout 10 2>>%s/condor.proof.%d",
106  vm, gSystem->TempDirectory(), gSystem->GetUid() );
107 
108  PDB(kCondor,2) Info("ClaimVM","command: %s", claimCmd.Data());
109  FILE *pipe = gSystem->OpenPipe(claimCmd, "r");
110 
111  if (!pipe) {
112  SysError("ClaimVM","cannot run command: %s", claimCmd.Data());
113  return 0;
114  }
115 
116  TString claimId;
117  TString line;
118  while (line.Gets(pipe)) {
119  PDB(kCondor,3) Info("ClaimVM","line = %s", line.Data());
120 
121  if (line.BeginsWith("ClaimId = \"")) {
122  line.Remove(0, line.Index("\"")+1);
123  line.Chop(); // remove trailing "
124  claimId = line;
125  PDB(kCondor,1) Info("ClaimVM","claim = '%s'", claimId.Data());
126  TRegexp r("[0-9]*$");
127  TString num = line(r);
128  port = 37000 + atoi(num.Data());
129  PDB(kCondor,1) Info("ClaimVM","port = %d", port);
130  }
131  }
132 
133  Int_t r = gSystem->ClosePipe(pipe);
134  if (r) {
135  Error("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
136  return 0;
137  } else {
138  PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
139  }
140 
141  TString jobad("jobad");
142  FILE *jf = gSystem->TempFileName(jobad);
143 
144  if (jf == 0) return 0;
145 
146  TString str(cmd);
147  str.ReplaceAll("$(Port)", Form("%d", port));
148  fputs(str, jf);
149 
150  fclose(jf);
151 
152  TString activateCmd = Form("condor_cod activate -id '%s' -jobad %s",
153  claimId.Data(), jobad.Data() );
154 
155  PDB(kCondor,2) Info("ClaimVM","command: %s", activateCmd.Data());
156  pipe = gSystem->OpenPipe(activateCmd, "r");
157 
158  if (!pipe) {
159  SysError("ClaimVM","cannot run command: %s", activateCmd.Data());
160  return 0;
161  }
162 
163  while (line.Gets(pipe)) {
164  PDB(kCondor,3) Info("ClaimVM","Activate: line = %s", line.Data());
165  }
166 
167  r = gSystem->ClosePipe(pipe);
168  if (r) {
169  Error("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
170  } else {
171  PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
172  }
173 
174  gSystem->Unlink(jobad);
175 
176  // TODO: get info at the start for all nodes ...
177  TCondorSlave *claim = new TCondorSlave;
178  claim->fClaimID = claimId;
179  TString node(vm);
180  node = node.Remove(0, node.Index("@")+1);
181  claim->fHostname = node;
182  claim->fPort = port;
183  claim->fPerfIdx = 100; //set performance index to 100 by default
184  claim->fImage = node; //set image to hostname by default
185 
186  return claim;
187 }
188 
189 
190 ////////////////////////////////////////////////////////////////////////////////
191 /// Get the names of the virtual machines in the pool.
192 /// Return a TList of TObjString or 0 in case of failure
193 
194 TList *TCondor::GetVirtualMachines() const
195 {
196  TString poolopt = fPool ? Form("-pool %s", fPool.Data()) : "";
197  TString cmd = Form("condor_status %s -format \"%%s\\n\" Name", poolopt.Data());
198 
199  PDB(kCondor,2) Info("GetVirtualMachines","command: %s", cmd.Data());
200 
201  FILE *pipe = gSystem->OpenPipe(cmd, "r");
202 
203  if (!pipe) {
204  SysError("GetVirtualMachines","cannot run command: %s", cmd.Data());
205  return 0;
206  }
207 
208  TString line;
209  TList *l = new TList;
210  while (line.Gets(pipe)) {
211  PDB(kCondor,3) Info("GetVirtualMachines","line = %s", line.Data());
212  if (line != "") l->Add(new TObjString(line));
213  }
214 
215  Int_t r = gSystem->ClosePipe(pipe);
216  if (r) {
217  delete l;
218  Error("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
219  return 0;
220  } else {
221  PDB(kCondor,1) Info("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
222  }
223 
224  return l;
225 }
226 
227 
228 ////////////////////////////////////////////////////////////////////////////////
229 /// Claim n virtual machines
230 /// This function figures out the image and performance index before returning
231 /// the list of condor slaves
232 
233 TList *TCondor::Claim(Int_t n, const char *cmd)
234 {
235  if (fState != kFree) {
236  Error("Claim","not in state Free");
237  return 0;
238  }
239 
240  TList *vms = GetVirtualMachines();
241  TIter next(vms);
242  TObjString *vm;
243  for(Int_t i=0; i < n && (vm = (TObjString*) next()) != 0; i++ ) {
244  TCondorSlave *claim = ClaimVM(vm->GetName(), cmd);
245  if (claim != 0) {
246  if ( !GetVmInfo(vm->GetName(), claim->fImage, claim->fPerfIdx) ) {
247  // assume vm is gone
248  delete claim;
249  } else {
250  fClaims->Add(claim);
251  fState = kActive;
252  }
253  }
254  }
255 
256  return fClaims;
257 }
258 
259 
260 ////////////////////////////////////////////////////////////////////////////////
261 /// Claim virtual machine with name vmname
262 /// This function does not figure out the image and performance index before
263 /// returning the condor slave
264 
265 TCondorSlave *TCondor::Claim(const char *vmname, const char *cmd)
266 {
267  if (fState != kFree && fState != kActive) {
268  Error("Claim","not in state Free or Active");
269  return 0;
270  }
271 
272  TCondorSlave *claim = ClaimVM(vmname, cmd);
273  if (claim != 0) {
274  fClaims->Add(claim);
275  fState = kActive;
276  }
277 
278  return claim;
279 }
280 
281 
282 ////////////////////////////////////////////////////////////////////////////////
283 /// Set the state of workers
284 
285 Bool_t TCondor::SetState(EState state)
286 {
287  PDB(kCondor,1) Info("SetState","state: %s (%lld)",
288  state == kSuspended ? "kSuspended" : "kActive", Long64_t(gSystem->Now()));
289  TIter next(fClaims);
290  TCondorSlave *claim;
291  while((claim = (TCondorSlave*) next()) != 0) {
292  TString cmd = Form("condor_cod %s -id '%s'",
293  state == kSuspended ? "suspend" : "resume",
294  claim->fClaimID.Data());
295 
296  PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
297  FILE *pipe = gSystem->OpenPipe(cmd, "r");
298 
299  if (!pipe) {
300  SysError("SetState","cannot run command: %s", cmd.Data());
301  return kFALSE;
302  }
303 
304  TString line;
305  while (line.Gets(pipe)) {
306  PDB(kCondor,3) Info("SetState","line = %s", line.Data());
307  }
308 
309  Int_t r = gSystem->ClosePipe(pipe);
310  if (r) {
311  Error("SetState","command: %s returned %d", cmd.Data(), r);
312  return kFALSE;
313  } else {
314  PDB(kCondor,1) Info("SetState","command: %s returned %d", cmd.Data(), r);
315  }
316  }
317 
318  fState = state;
319  return kTRUE;
320 }
321 
322 
323 ////////////////////////////////////////////////////////////////////////////////
324 /// Suspend worker
325 
326 Bool_t TCondor::Suspend()
327 {
328  if (fState != kActive) {
329  Error("Suspend","not in state Active");
330  return kFALSE;
331  }
332 
333  return SetState(kSuspended);
334 }
335 
336 
337 ////////////////////////////////////////////////////////////////////////////////
338 /// Resume worker
339 
340 Bool_t TCondor::Resume()
341 {
342  if (fState != kSuspended) {
343  Error("Suspend","not in state Suspended");
344  return kFALSE;
345  }
346 
347  return SetState(kActive);
348 }
349 
350 
351 ////////////////////////////////////////////////////////////////////////////////
352 /// Release worker
353 
354 Bool_t TCondor::Release()
355 {
356  if (fState == kFree) {
357  Error("Suspend","not in state Active or Suspended");
358  return kFALSE;
359  }
360 
361  TCondorSlave *claim;
362  while((claim = (TCondorSlave*) fClaims->First()) != 0) {
363  TString cmd = Form("condor_cod release -id '%s'", claim->fClaimID.Data());
364 
365  PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
366  FILE *pipe = gSystem->OpenPipe(cmd, "r");
367 
368  if (!pipe) {
369  SysError("Release","cannot run command: %s", cmd.Data());
370  return kFALSE;
371  }
372 
373  TString line;
374  while (line.Gets(pipe)) {
375  PDB(kCondor,3) Info("Release","line = %s", line.Data());
376  }
377 
378  Int_t r = gSystem->ClosePipe(pipe);
379  if (r) {
380  Error("Release","command: %s returned %d", cmd.Data(), r);
381  return kFALSE;
382  } else {
383  PDB(kCondor,1) Info("Release","command: %s returned %d", cmd.Data(), r);
384  }
385 
386  fClaims->Remove(claim);
387  delete claim;
388  }
389 
390  fState = kFree;
391  return kTRUE;
392 }
393 
394 
395 ////////////////////////////////////////////////////////////////////////////////
396 /// Get info about worker status
397 
398 Bool_t TCondor::GetVmInfo(const char *vm, TString &image, Int_t &perfidx) const
399 {
400  TString cmd = Form("condor_status -format \"%%d:\" Mips -format \"%%s\\n\" FileSystemDomain "
401  "-const 'Name==\"%s\"'", vm);
402 
403  PDB(kCondor,2) Info("GetVmInfo","command: %s", cmd.Data());
404  FILE *pipe = gSystem->OpenPipe(cmd, "r");
405 
406  if (!pipe) {
407  SysError("GetVmInfo","cannot run command: %s", cmd.Data());
408  return kFALSE;
409  }
410 
411  TString line;
412  while (line.Gets(pipe)) {
413  PDB(kCondor,3) Info("GetVmInfo","line = %s", line.Data());
414  if (line != "") {
415  TString amips = line(TRegexp("^[0-9]*"));
416  perfidx = atoi(amips);
417  image = line(TRegexp("[^:]+$"));
418  break;
419  }
420  }
421 
422  Int_t r = gSystem->ClosePipe(pipe);
423  if (r) {
424  Error("GetVmInfo","command: %s returned %d", cmd.Data(), r);
425  return kFALSE;
426  } else {
427  PDB(kCondor,1) Info("GetVmInfo","command: %s returned %d", cmd.Data(), r);
428  }
429 
430  return kTRUE;
431 }
432 
433 
434 ////////////////////////////////////////////////////////////////////////////////
435 /// Get image of the worker
436 
437 TString TCondor::GetImage(const char *host) const
438 {
439  TString cmd = Form("condor_status -direct %s -format \"Image:%%s\\n\" "
440  "FileSystemDomain", host);
441 
442  PDB(kCondor,2) Info("GetImage","command: %s", cmd.Data());
443 
444  FILE *pipe = gSystem->OpenPipe(cmd, "r");
445 
446  if (!pipe) {
447  SysError("GetImage","cannot run command: %s", cmd.Data());
448  return "";
449  }
450 
451  TString image;
452  TString line;
453  while (line.Gets(pipe)) {
454  PDB(kCondor,3) Info("GetImage","line = %s", line.Data());
455  if (line != "") {
456  image = line(TRegexp("[^:]+$"));
457  break;
458  }
459  }
460 
461  Int_t r = gSystem->ClosePipe(pipe);
462  if (r) {
463  Error("GetImage","command: %s returned %d", cmd.Data(), r);
464  return "";
465  } else {
466  PDB(kCondor,1) Info("GetImage","command: %s returned %d", cmd.Data(), r);
467  }
468 
469  return image;
470 }
471 
472 
473 ////////////////////////////////////////////////////////////////////////////////
474 /// Print worker status
475 
476 void TCondorSlave::Print(Option_t * /*opt*/ ) const
477 {
478  std::cout << "OBJ: " << IsA()->GetName()
479  << " " << fHostname << ":" << fPort
480  << " Perf: " << fPerfIdx
481  << " Image: " << fImage << std::endl;
482 }