Skip to content

Commit

Permalink
Add more exit codes to Frame state waiting (#1131)
Browse files Browse the repository at this point in the history
* Add more exit codes to Frame state waiting

When determining the frame state cuebot
needs to make sure that certain exit statuses
put the frame state back into waiting, this
will help save time for users when a frame
fails for host hardware issues.

(cherry picked from commit cbeda9387b09a8713bb76d9075fdee72c3c795f1)

* Add more exit codes to Frame state waiting

* Removed unused method updateFrameHostDown

* Removed deprecated oracle FrameDaoJdbc file

* Remove log debugging from FrameCompleteHandler

* Reverting, adding updateFrameHostDown FrameDao method

* Verion bump, add missing interface checkRetries method

Co-authored-by: Diego Tavares da Silva <[email protected]>
  • Loading branch information
roulaoregan-spi and DiegoTavares committed May 10, 2022
1 parent 870b0b5 commit 02cb67f
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 44 deletions.
17 changes: 17 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/dao/FrameDao.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ public interface FrameDao {
*/
public FrameDetail findLongestFrame(JobInterface job);

/**
* Checks to see how many retries a frame has. If that number
* is greater than or equal to the jobs max retries, the frame
* is marked as dead.
*
* @param frame
*/
void checkRetries(FrameInterface frame);

/**
* Batch inserts a frameSet of frames.
*
Expand Down Expand Up @@ -194,6 +203,14 @@ boolean updateFrameStopped(FrameInterface frame, FrameState state, int exitStatu
*/
boolean updateFrameCleared(FrameInterface frame);

/**
* Sets a frame to an unreserved waiting state.
*
* @param frame
* @return
*/
boolean updateFrameHostDown(FrameInterface frame);

/**
* Returns a DispatchFrame object from the frame's uinique ID.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import java.sql.Timestamp;
import java.util.Optional;

import org.springframework.dao.DataAccessException;
import org.springframework.jdbc.core.RowMapper;
import org.springframework.jdbc.core.support.JdbcDaoSupport;

Expand Down Expand Up @@ -116,7 +117,7 @@ public boolean updateFrameStopped(FrameInterface frame, FrameState state,
frame.getVersion()) == 1;
}

private static final String UPDATE_FRAME_CLEARED =
private static final String UPDATE_FRAME_REASON =
"UPDATE "+
"frame "+
"SET " +
Expand All @@ -132,17 +133,26 @@ public boolean updateFrameStopped(FrameInterface frame, FrameState state,
"(SELECT proc.pk_frame FROM " +
"proc WHERE proc.pk_frame=?)";

@Override
public boolean updateFrameCleared(FrameInterface frame) {
private int updateFrame(FrameInterface frame, int exitStatus) {

int result = getJdbcTemplate().update(
UPDATE_FRAME_CLEARED,
FrameState.WAITING.toString(),
Dispatcher.EXIT_STATUS_FRAME_CLEARED,
frame.getFrameId(),
frame.getFrameId());
UPDATE_FRAME_REASON,
FrameState.WAITING.toString(),
exitStatus,
frame.getFrameId(),
frame.getFrameId());

return result > 0;
return result;
}

@Override
public boolean updateFrameHostDown(FrameInterface frame) {
return updateFrame(frame, Dispatcher.EXIT_STATUS_DOWN_HOST) > 0;
}

@Override
public boolean updateFrameCleared(FrameInterface frame) {
return updateFrame(frame, Dispatcher.EXIT_STATUS_FRAME_CLEARED) > 0;
}

private static final String UPDATE_FRAME_STARTED =
Expand Down Expand Up @@ -196,31 +206,45 @@ public boolean updateFrameCleared(FrameInterface frame) {
"WHERE " +
"pk_frame = ? " +
"AND " +
"int_exit_status NOT IN (?,?,?) ";
"int_exit_status NOT IN (?,?,?,?,?,?,?) ";

@Override
public void updateFrameStarted(VirtualProc proc, FrameInterface frame) {

lockFrameForUpdate(frame, FrameState.WAITING);

int result = getJdbcTemplate().update(UPDATE_FRAME_STARTED,
FrameState.RUNNING.toString(), proc.hostName, proc.coresReserved,
proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved, frame.getFrameId(),
FrameState.WAITING.toString(), frame.getVersion());

if (result == 0) {
String error_msg = "the frame " +
frame + " was updated by another thread.";
throw new FrameReservationException(error_msg);
try {
int result = getJdbcTemplate().update(UPDATE_FRAME_STARTED,
FrameState.RUNNING.toString(), proc.hostName, proc.coresReserved,
proc.memoryReserved, proc.gpusReserved, proc.gpuMemoryReserved, frame.getFrameId(),
FrameState.WAITING.toString(), frame.getVersion());
if (result == 0) {
String error_msg = "the frame " +
frame + " was updated by another thread.";
throw new FrameReservationException(error_msg);
}
} catch (DataAccessException e) {
/*
* This usually happens when the folder's max cores
* limit has exceeded
*/
throw new FrameReservationException(e.getCause());
}

/*
* Frames that were killed via nimby or hardware errors not attributed to
* the software do not increment the retry counter.
* the software do not increment the retry counter. Like failed launch,
* orphaned frame, failed kill or down host.
*/
getJdbcTemplate().update(UPDATE_FRAME_RETRIES,
frame.getFrameId(), -1, FrameExitStatus.SKIP_RETRY_VALUE,
Dispatcher.EXIT_STATUS_FRAME_CLEARED);
try {
getJdbcTemplate().update(UPDATE_FRAME_RETRIES,
frame.getFrameId(), -1, FrameExitStatus.SKIP_RETRY_VALUE,
FrameExitStatus.FAILED_LAUNCH_VALUE, Dispatcher.EXIT_STATUS_FRAME_CLEARED,
Dispatcher.EXIT_STATUS_FRAME_ORPHAN, Dispatcher.EXIT_STATUS_FAILED_KILL,
Dispatcher.EXIT_STATUS_DOWN_HOST);
} catch (DataAccessException e) {
throw new FrameReservationException(e.getCause());
}
}

private static final String UPDATE_FRAME_FIXED =
Expand Down Expand Up @@ -657,6 +681,21 @@ public FrameInterface findFrame(JobInterface job, String name) {
FRAME_MAPPER, name, job.getJobId());
}

@Override
public void checkRetries(FrameInterface frame) {
int max_retries = getJdbcTemplate().queryForObject(
"SELECT int_max_retries FROM job WHERE pk_job=?", Integer.class,
frame.getJobId());

if (getJdbcTemplate().queryForObject(
"SELECT int_retries FROM frame WHERE pk_frame=?", Integer.class,
frame.getFrameId()) >= max_retries) {
getJdbcTemplate().update(
"UPDATE frame SET str_state=? WHERE pk_frame=?",
FrameState.DEAD.toString(), frame.getFrameId());
}
}

private static final String UPDATE_FRAME_STATE =
"UPDATE " +
"frame "+
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,11 @@
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.log4j.Logger;
import org.springframework.dao.EmptyResultDataAccessException;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;

import com.imageworks.spcue.AllocationInterface;
import com.imageworks.spcue.DispatchFrame;
import com.imageworks.spcue.DispatchHost;
import com.imageworks.spcue.FacilityInterface;
import com.imageworks.spcue.FrameDetail;
import com.imageworks.spcue.FrameInterface;
import com.imageworks.spcue.GroupInterface;
import com.imageworks.spcue.HostInterface;
Expand All @@ -43,6 +39,11 @@
import com.imageworks.spcue.ShowInterface;
import com.imageworks.spcue.StrandedCores;
import com.imageworks.spcue.VirtualProc;
import org.apache.log4j.Logger;
import org.springframework.dao.EmptyResultDataAccessException;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;

import com.imageworks.spcue.dao.BookingDao;
import com.imageworks.spcue.dao.DispatcherDao;
import com.imageworks.spcue.dao.FrameDao;
Expand Down Expand Up @@ -522,13 +523,29 @@ public void lostProc(VirtualProc proc, String reason, int exitStatus) {
frameDao.updateFrameCheckpointState(f, CheckpointState.DISABLED);
/*
* If the proc has a frame, stop the frame. Frames
* can only be stopped that are running, so if the frame
* is not running it will remain untouched.
* can only be stopped that are running.
*/
if (frameDao.updateFrameStopped(f,
FrameState.WAITING, exitStatus)) {
updateUsageCounters(proc, exitStatus);
}
/*
* If the frame is not running, check if frame is in dead state,
* frames that died due to host going down should be put back
* into WAITING status.
*/
else {
FrameDetail frameDetail = frameDao.getFrameDetail(f);
if ((frameDetail.state == FrameState.DEAD) &&
(Dispatcher.EXIT_STATUS_DOWN_HOST == exitStatus)) {
if (frameDao.updateFrameHostDown(f)) {
logger.info("update frame " + f.getFrameId() +
"to WAITING status for down host");
}
}
}
} else {
logger.info("Frame ID is NULL, not updating Frame state");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@


package com.imageworks.spcue.dispatcher;

import java.util.List;

import com.imageworks.spcue.DispatchFrame;
Expand All @@ -34,6 +33,9 @@

public interface Dispatcher {

// Maximum number of core points that can be assigned to a frame
public static final int CORE_POINTS_RESERVED_MAX = 2400;

// The default number of core points assigned to a frame, if no core
// point value is specified
public static final int CORE_POINTS_RESERVED_DEFAULT = 100;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -321,17 +321,17 @@ public void handlePostFrameCompleteOperations(VirtualProc proc,
}

/*
* An exit status of NO_RETRY (256) indicates that the frame could
* An exit status of FAILED_LAUNCH (256) indicates that the frame could
* not be launched due to some unforeseen unrecoverable error that
* is not checked when the launch command is given. The most common
* cause of this is when the job log directory is removed before the
* job is complete.
*
* Frames that return a 256 are not automatically retried.
* Frames that return a 256 are put Frame back into WAITING status
*/

else if (report.getExitStatus() == FrameExitStatus.NO_RETRY_VALUE) {
logger.info("unbooking " + proc + " frame status was no-retry.");
else if (report.getExitStatus() == FrameExitStatus.FAILED_LAUNCH_VALUE) {
logger.info("unbooking " + proc + " frame status was failed frame launch.");
unbookProc = true;
}

Expand Down Expand Up @@ -544,7 +544,8 @@ else if (report.getHost().getNimbyLocked()) {
* @param report
* @return
*/
public static final FrameState determineFrameState(DispatchJob job, LayerDetail layer, DispatchFrame frame, FrameCompleteReport report) {
public static final FrameState determineFrameState(DispatchJob job, LayerDetail layer,
DispatchFrame frame, FrameCompleteReport report) {

if (EnumSet.of(FrameState.WAITING, FrameState.EATEN).contains(
frame.state)) {
Expand All @@ -567,17 +568,25 @@ else if (frame.state.equals(FrameState.DEAD)) {
|| (job.maxRetries != 0 && report.getExitSignal() == 119)) {
report = FrameCompleteReport.newBuilder(report).setExitStatus(FrameExitStatus.SKIP_RETRY_VALUE).build();
newState = FrameState.WAITING;
// exemption code 256
} else if ((report.getExitStatus() == FrameExitStatus.FAILED_LAUNCH_VALUE ||
report.getExitSignal() == FrameExitStatus.FAILED_LAUNCH_VALUE) &&
(frame.retries < job.maxRetries)) {
report = FrameCompleteReport.newBuilder(report).setExitStatus(report.getExitStatus()).build();
newState = FrameState.WAITING;
} else if (job.autoEat) {
newState = FrameState.EATEN;
// ETC Time out and LLU timeout
} else if (layer.timeout_llu != 0 && report.getFrame().getLluTime() != 0 && lastUpdate > (layer.timeout_llu -1)) {
} else if (layer.timeout_llu != 0 && report.getFrame().getLluTime() != 0
&& lastUpdate > (layer.timeout_llu -1)) {
newState = FrameState.DEAD;
} else if (layer.timeout != 0 && report.getRunTime() > layer.timeout * 60) {
newState = FrameState.DEAD;
} else if (report.getRunTime() > Dispatcher.FRAME_TIME_NO_RETRY) {
newState = FrameState.DEAD;
} else if (frame.retries >= job.maxRetries) {
if (!(report.getExitStatus() == Dispatcher.EXIT_STATUS_MEMORY_FAILURE || report.getExitSignal() == Dispatcher.EXIT_STATUS_MEMORY_FAILURE))
if (!(report.getExitStatus() == Dispatcher.EXIT_STATUS_MEMORY_FAILURE
|| report.getExitSignal() == Dispatcher.EXIT_STATUS_MEMORY_FAILURE))
newState = FrameState.DEAD;
}

Expand Down
4 changes: 2 additions & 2 deletions proto/job.proto
Original file line number Diff line number Diff line change
Expand Up @@ -400,8 +400,8 @@ enum CheckpointState {
enum FrameExitStatus {
// The frame was a success
SUCCESS = 0;
// The frame should not be retried
NO_RETRY = 256;
// The frame should be automatically retried
FAILED_LAUNCH = 256;
// Retries should not be incremented
SKIP_RETRY = 286;
}
Expand Down
2 changes: 1 addition & 1 deletion pycue/opencue/wrappers/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class CheckpointState(enum.IntEnum):
class FrameExitStatus(enum.IntEnum):
"""Possible frame exit statuses."""
SUCCESS = job_pb2.SUCCESS
NO_RETRY = job_pb2.NO_RETRY
FAILED_LAUNCH = job_pb2.FAILED_LAUNCH
SKIP_RETRY = job_pb2.SKIP_RETRY

class FrameState(enum.IntEnum):
Expand Down
6 changes: 3 additions & 3 deletions pycue/tests/wrappers/frame_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,9 +206,9 @@ def testCheckpointState(self):
self.assertEqual(opencue.api.Frame.CheckpointState.DISABLED, 0)

def testFrameExitStatus(self):
self.assertEqual(opencue.api.Frame.FrameExitStatus.NO_RETRY,
opencue.compiled_proto.job_pb2.NO_RETRY)
self.assertEqual(opencue.api.Frame.FrameExitStatus.NO_RETRY, 256)
self.assertEqual(opencue.api.Frame.FrameExitStatus.FAILED_LAUNCH,
opencue.compiled_proto.job_pb2.FAILED_LAUNCH)
self.assertEqual(opencue.api.Frame.FrameExitStatus.FAILED_LAUNCH, 256)

def testFrameState(self):
self.assertEqual(opencue.api.Frame.FrameState.RUNNING,
Expand Down

0 comments on commit 02cb67f

Please sign in to comment.