# SLIDE 3
#
# Schedd-side periodic policy: hold runaway jobs, and remove jobs that are
# stuck or keep restarting.  Every expression below is restricted to
# JobUniverse == 5 (the vanilla universe in HTCondor's numbering).
#
# NOTE: comments are deliberately kept outside the backslash-continued
# expressions so that no comment text can end up inside a macro value.

# Hold idle (JobStatus == 1) or running (JobStatus == 2) jobs that either
#   * use an absurd amount of disk (DiskUsage > 50000000, i.e. 50+ GB), or
#   * have a ResidentSetSize above 1000*2000 AND exceed their memory request.
# Not all of our jobs define RequestMemory; when it is undefined, crossing the
# 1000*2000 floor alone is enough to trigger the hold.
# NOTE(review): ResidentSetSize is presumably reported in KiB and
# RequestMemory in MiB, which makes the factor 1000 a slightly strict
# approximation of 1024 -- confirm before changing.
SYSTEM_PERIODIC_HOLD = \
    (JobUniverse == 5) && ( \
        (JobStatus == 1 || JobStatus == 2) && ( \
            (DiskUsage > 50000000) || \
            (ResidentSetSize > 1000*2000 && \
                ifThenElse(isUndefined(RequestMemory), True, \
                           ResidentSetSize > 1000*RequestMemory)) \
        ) \
    )

# Report why the job went on hold.  The memory test mirrors the one in
# SYSTEM_PERIODIC_HOLD; if it does not match, the hold is attributed to disk
# usage.  When both limits are exceeded, memory usage is the reported reason.
SYSTEM_PERIODIC_HOLD_REASON = \
    strcat("Job in status ", JobStatus, \
           " put on hold by SYSTEM_PERIODIC_HOLD due to ", \
           ifThenElse(isUndefined(ResidentSetSize) == False && \
                      ResidentSetSize > 1000*2000 && \
                      ifThenElse(isUndefined(RequestMemory), True, \
                                 ResidentSetSize > 1000*RequestMemory), \
                      strcat("memory usage ", ResidentSetSize), \
                      strcat("disk usage ", DiskUsage)), \
           ".")

# Forceful removal of:
#   * running jobs (JobStatus == 2) after 9 days in that state,
#   * held jobs (JobStatus == 5) after 7 days in that state,
#   * anything that has already started 10 times (JobRunCount >= 10), except
#     for users with user-level checkpointing (bxie, strolog).
# FIX: the held-job threshold used to be 3600*24*6 (6 days) although both this
# policy comment and the removal reason shown to users say 7 days.  Removal is
# destructive, so the code now matches the documented 7 days.
# NOTE(review): JobRunCount >= 10 fires on the 10th start, while the reason
# text says "more than 10 restarts" -- confirm which wording is intended.
SYSTEM_PERIODIC_REMOVE = \
    (JobUniverse == 5) && ( \
        (JobStatus == 2 && CurrentTime - EnteredCurrentStatus > 3600*24*9) || \
        (JobStatus == 5 && CurrentTime - EnteredCurrentStatus > 3600*24*7) || \
        ((JobRunCount >= 10) && (Owner =!= "bxie") && (Owner =!= "strolog")) \
    )

# Record why the job was removed.  The tests are evaluated in the same order
# as the clauses of SYSTEM_PERIODIC_REMOVE and must use the same thresholds;
# anything matching neither time limit is attributed to the restart count.
SYSTEM_PERIODIC_REMOVE_REASON = \
    strcat("Job removed by SYSTEM_PERIODIC_REMOVE due to ", \
           ifThenElse(JobStatus == 2 && \
                      CurrentTime - EnteredCurrentStatus > 3600*24*9, \
                      "runtime of longer than 9 days", \
                      ifThenElse(JobStatus == 5 && \
                                 CurrentTime - EnteredCurrentStatus > 3600*24*7, \
                                 "being in hold state for 7 days", \
                                 "more than 10 restarts") \
           ) \
    )