Skip to content

Commit af5dfe9

Browse files
xiaozhouXk8s-ci-robot
authored andcommitted
optimize arena logs for show failed chief pod log (kubeflow#96)
1 parent 2c33a08 commit af5dfe9

File tree

2 files changed

+48
-63
lines changed

2 files changed

+48
-63
lines changed

cmd/arena/commands/trainer_mpi.go

Lines changed: 24 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -276,13 +276,10 @@ func (tt *MPIJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob,
276276

277277
func (tt *MPIJobTrainer) getTrainingJob(name, namespace string) (TrainingJob, error) {
278278
var (
279-
chiefPod v1.Pod
280279
mpijob v1alpha1.MPIJob
281280
)
282281

283282
// 1. Get the batchJob of training Job
284-
pods := []v1.Pod{}
285-
286283
mpijobList, err := tt.mpijobClient.KubeflowV1alpha1().MPIJobs(namespace).List(metav1.ListOptions{
287284
LabelSelector: fmt.Sprintf("release=%s", name),
288285
})
@@ -308,18 +305,7 @@ func (tt *MPIJobTrainer) getTrainingJob(name, namespace string) (TrainingJob, er
308305
return nil, err
309306
}
310307

311-
for _, item := range podList.Items {
312-
if !tt.isMPIPod(name, namespace, item) {
313-
continue
314-
}
315-
if tt.isChiefPod(item) {
316-
chiefPod = item
317-
}
318-
319-
// for non-job pod, add it into the pod list
320-
pods = append(pods, item)
321-
log.Debugf("add pod %v to pods", item)
322-
}
308+
pods, chiefPod := getPodsOfMPIJob(tt, podList.Items)
323309

324310
return &MPIJob{
325311
mpijob: mpijob,
@@ -335,12 +321,9 @@ func (tt *MPIJobTrainer) getTrainingJob(name, namespace string) (TrainingJob, er
335321
func (tt *MPIJobTrainer) getTrainingJobFromCache(name, ns string) (TrainingJob, error) {
336322

337323
var (
338-
chiefPod v1.Pod
339324
mpijob v1alpha1.MPIJob
340325
)
341326

342-
pods := []v1.Pod{}
343-
344327
// 1. Find the batch job
345328
for _, item := range allMPIjobs {
346329
if tt.isMPIJob(name, ns, item) {
@@ -350,20 +333,7 @@ func (tt *MPIJobTrainer) getTrainingJobFromCache(name, ns string) (TrainingJob,
350333
}
351334

352335
// 2. Find the pods, and determine the pod of the job
353-
for _, item := range allPods {
354-
355-
if !tt.isMPIPod(name, ns, item) {
356-
continue
357-
}
358-
if tt.isChiefPod(item) {
359-
chiefPod = item
360-
}
361-
362-
// for non-job pod, add it into the pod list
363-
pods = append(pods, item)
364-
log.Debugf("add pod %v to pods", item)
365-
366-
}
336+
pods, chiefPod := getPodsOfMPIJob(tt, allPods)
367337

368338
return &MPIJob{
369339
mpijob: mpijob,
@@ -444,3 +414,25 @@ func isMPIJobFailed(status v1alpha1.MPIJobStatus) bool {
444414
func isMPIJobPending(status v1alpha1.MPIJobStatus) bool {
445415
return false
446416
}
417+
418+
419+
func getPodsOfMPIJob(tt *MPIJobTrainer, podList []v1.Pod) (pods []v1.Pod, chiefPod v1.Pod) {
420+
pods = []v1.Pod{}
421+
for _, item := range podList {
422+
if !tt.isMPIPod(name, namespace, item) {
423+
continue
424+
}
425+
if tt.isChiefPod(item) && item.CreationTimestamp.After(chiefPod.CreationTimestamp.Time) {
426+
// If there are some failed chiefPod, and the new chiefPod haven't started, set the latest failed pod as chief pod
427+
if chiefPod.Name != "" && item.Status.Phase == v1.PodPending {
428+
continue
429+
}
430+
chiefPod = item
431+
}
432+
433+
// for non-job pod, add it into the pod list
434+
pods = append(pods, item)
435+
log.Debugf("add pod %v to pods", item)
436+
}
437+
return pods, chiefPod
438+
}

cmd/arena/commands/trainer_tensorflow.go

Lines changed: 24 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,6 @@ func (tt *TensorFlowJobTrainer) GetTrainingJob(name, namespace string) (tj Train
269269

270270
func (tt *TensorFlowJobTrainer) getTrainingJob(name, namespace string) (TrainingJob, error) {
271271
var (
272-
chiefPod v1.Pod
273272
tfjob tfv1alpha2.TFJob
274273
)
275274

@@ -303,19 +302,7 @@ func (tt *TensorFlowJobTrainer) getTrainingJob(name, namespace string) (Training
303302
if err != nil {
304303
return nil, err
305304
}
306-
307-
for _, item := range podList.Items {
308-
if !tt.isTensorFlowPod(name, namespace, item) {
309-
continue
310-
}
311-
if tt.isChiefPod(tfjob, item) {
312-
chiefPod = item
313-
}
314-
315-
// for non-job pod, add it into the pod list
316-
pods = append(pods, item)
317-
log.Debugf("add pod %v to pods", item)
318-
}
305+
pods, chiefPod := getPodsOfTFJob(tt, tfjob, podList.Items)
319306

320307
return &TensorFlowJob{
321308
tfjob: tfjob,
@@ -331,12 +318,9 @@ func (tt *TensorFlowJobTrainer) getTrainingJob(name, namespace string) (Training
331318
func (tt *TensorFlowJobTrainer) getTrainingJobFromCache(name, ns string) (TrainingJob, error) {
332319

333320
var (
334-
chiefPod v1.Pod
335321
tfjob tfv1alpha2.TFJob
336322
)
337323

338-
pods := []v1.Pod{}
339-
340324
// 1. Find the batch job
341325
for _, item := range allTfjobs {
342326
if tt.isTensorFlowJob(name, ns, item) {
@@ -345,21 +329,9 @@ func (tt *TensorFlowJobTrainer) getTrainingJobFromCache(name, ns string) (Traini
345329
}
346330
}
347331

348-
// 2. Find the pods, and determine the pod of the job
349-
for _, item := range allPods {
350-
351-
if !tt.isTensorFlowPod(name, ns, item) {
352-
continue
353-
}
354-
if tt.isChiefPod(tfjob, item) {
355-
chiefPod = item
356-
}
357-
358-
// for non-job pod, add it into the pod list
359-
pods = append(pods, item)
360-
log.Debugf("add pod %v to pods", item)
361332

362-
}
333+
// 2. Find the pods, and determine the pod of the job
334+
pods, chiefPod := getPodsOfTFJob(tt, tfjob, allPods)
363335

364336
return &TensorFlowJob{
365337
tfjob: tfjob,
@@ -492,3 +464,24 @@ func checkStatus(status tfv1alpha2.TFJobStatus) tfv1alpha2.TFJobConditionType {
492464
}
493465
return t
494466
}
467+
468+
func getPodsOfTFJob(tt *TensorFlowJobTrainer, tfjob tfv1alpha2.TFJob, podList []v1.Pod) (pods []v1.Pod, chiefPod v1.Pod) {
469+
pods = []v1.Pod{}
470+
for _, item := range podList {
471+
if !tt.isTensorFlowPod(name, namespace, item) {
472+
continue
473+
}
474+
if tt.isChiefPod(tfjob, item) && item.CreationTimestamp.After(chiefPod.CreationTimestamp.Time) {
475+
// If there are some failed chiefPod, and the new chiefPod haven't started, set the latest failed pod as chief pod
476+
if chiefPod.Name != "" && item.Status.Phase == v1.PodPending {
477+
continue
478+
}
479+
chiefPod = item
480+
}
481+
482+
// for non-job pod, add it into the pod list
483+
pods = append(pods, item)
484+
log.Debugf("add pod %v to pods", item)
485+
}
486+
return pods, chiefPod
487+
}

0 commit comments

Comments
 (0)