Skip to content

Commit 73aa7cc

Browse files
bhou2bhou
andauthored
Force updating the job status to KILLED when killing a job that has a connected agent but no response observer (#1192)
Co-authored-by: bhou <[email protected]>
1 parent c6c81dc commit 73aa7cc

File tree

2 files changed

+64
-11
lines changed

2 files changed

+64
-11
lines changed

genie-web/src/main/java/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImpl.java

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -154,15 +154,42 @@ public void killJob(
154154
= this.parkedJobKillResponseObservers.remove(jobId);
155155

156156
if (responseObserver == null) {
157-
log.error("Job {} not killed. Expected local agent connection not found", jobId);
158-
throw new GenieServerException(
159-
"Job " + jobId + " not killed. Expected local agent connection not found."
157+
// This might happen when the agent has gone but its status is not updated
158+
// In this case, we force updating the job status to KILLED.
159+
log.warn("Tried to kill Job {}, but expected local agent connection not found. "
160+
+ "Trying to force updating the job status to {}",
161+
jobId,
162+
JobStatus.KILLED
160163
);
161-
}
162-
responseObserver.onNext(JobKillRegistrationResponse.newBuilder().build());
163-
responseObserver.onCompleted();
164+
try {
165+
this.persistenceService.updateJobStatus(jobId, currentJobStatus, JobStatus.KILLED, reason);
166+
log.info("Succeeded to force updating the status of Job {} to {}",
167+
jobId,
168+
JobStatus.KILLED
169+
);
170+
} catch (final GenieInvalidStatusException e) {
171+
log.error(
172+
"Failed to force updating the status of Job {} to {} "
173+
+ "due to current status not being expected {}",
174+
jobId,
175+
JobStatus.KILLED,
176+
currentJobStatus
177+
);
178+
throw e;
179+
} catch (final NotFoundException e) {
180+
log.error(
181+
"Failed to force updating the status of Job {} to {} due to job not found",
182+
jobId,
183+
JobStatus.KILLED
184+
);
185+
throw new GenieJobNotFoundException(e);
186+
}
187+
} else {
188+
responseObserver.onNext(JobKillRegistrationResponse.newBuilder().build());
189+
responseObserver.onCompleted();
164190

165-
log.info("Agent notified for killing job {}", jobId);
191+
log.info("Agent notified for killing job {}", jobId);
192+
}
166193
} else {
167194
// Agent is running somewhere else try to forward the request
168195
final String hostname = this.agentRoutingService

genie-web/src/test/groovy/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImplSpec.groovy

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -145,16 +145,42 @@ class GRpcJobKillServiceImplSpec extends Specification {
145145
0 * this.agentRoutingService.isAgentConnectionLocal(this.jobId)
146146
noExceptionThrown()
147147

148-
when: "The job is active, the agent is connected, the job is local but no observer"
148+
when: "The job is active, the agent is connected, the job is local but no observer, and force updating job status succeeded"
149149
this.serviceSpy.killJob(this.jobId, this.reason, this.servletRequest)
150150

151-
then: "Correct exception is thrown"
151+
then: "The database is updated and no exception is thrown"
152152
1 * this.persistenceService.getJobStatus(this.jobId) >> JobStatus.CLAIMED
153-
0 * this.persistenceService.updateJobStatus(_ as String, _ as JobStatus, _ as JobStatus, _ as String)
153+
1 * this.persistenceService.updateJobStatus(_ as String, _ as JobStatus, _ as JobStatus, _ as String)
154154
1 * this.agentRoutingService.isAgentConnectionLocal(this.jobId) >> true
155155
0 * this.responseObserver.onNext(_ as JobKillRegistrationResponse)
156156
0 * this.responseObserver.onCompleted()
157-
thrown(GenieServerException)
157+
noExceptionThrown()
158+
159+
when: "The job is active, the agent is connected, the job is local but no observer, and current job status is invalid for updating"
160+
this.serviceSpy.killJob(this.jobId, this.reason, this.servletRequest)
161+
162+
then: "The database is not updated and GenieInvalidStatusException is thrown"
163+
1 * this.persistenceService.getJobStatus(this.jobId) >> JobStatus.CLAIMED
164+
1 * this.persistenceService.updateJobStatus(this.jobId, JobStatus.CLAIMED, JobStatus.KILLED, this.reason) >> {
165+
throw new GenieInvalidStatusException()
166+
}
167+
1 * this.agentRoutingService.isAgentConnectionLocal(this.jobId) >> true
168+
0 * this.responseObserver.onNext(_ as JobKillRegistrationResponse)
169+
0 * this.responseObserver.onCompleted()
170+
thrown(GenieInvalidStatusException)
171+
172+
when: "The job is active, the agent is connected, the job is local but no observer, and the job is not found"
173+
this.serviceSpy.killJob(this.jobId, this.reason, this.servletRequest)
174+
175+
then: "The database is not updated and GenieJobNotFoundException is thrown"
176+
1 * this.persistenceService.getJobStatus(this.jobId) >> JobStatus.CLAIMED
177+
1 * this.persistenceService.updateJobStatus(this.jobId, JobStatus.CLAIMED, JobStatus.KILLED, this.reason) >> {
178+
throw new NotFoundException()
179+
}
180+
1 * this.agentRoutingService.isAgentConnectionLocal(this.jobId) >> true
181+
0 * this.responseObserver.onNext(_ as JobKillRegistrationResponse)
182+
0 * this.responseObserver.onCompleted()
183+
thrown(GenieJobNotFoundException)
158184

159185
when: "The job is active, the agent is connected, and there is an observer"
160186
this.serviceSpy.registerForKillNotification(this.request, this.responseObserver)

0 commit comments

Comments
 (0)