Waits for a list of job IDs to complete. Returns when all jobs are completed, or when max_wait is exceeded.
Parameters:
- job_ids (list) – list of job IDs to wait for.
- scheduler (str) – the scheduler to use. SGE or SLURM.
- max_wait (int, default: 3600) – maximum time to wait in seconds. Default is 1 hour.
TODO: Add kill_after parameter to kill jobs if we exceed max_wait. Default false.
Source code in ribbon/utils.py
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def wait_for_jobs(job_ids, scheduler, max_wait=3600, poll_interval=10):
    """
    Waits for a list of job IDs to complete. Returns when all jobs are completed, or when max_wait is exceeded.

    Args:
        job_ids (list): list of job IDs
        scheduler (str): the scheduler to use. SGE or SLURM.
        max_wait (int): maximum time to wait in seconds. Default is 1 hour.
        poll_interval (int): seconds to sleep between status checks. Default is 10.

    Returns:
        None

    Raises:
        ValueError: if scheduler is neither 'SGE' nor 'SLURM'.

    TODO: Add kill_after parameter to kill jobs if we exceed max_wait. Default false.
    """
    if scheduler == 'SGE':
        check_job_status = sge_check_job_status
    elif scheduler == 'SLURM':
        check_job_status = slurm_check_job_status
    else:
        raise ValueError('Invalid scheduler. Must be SGE or SLURM.')

    # Use a monotonic clock so the timeout is immune to wall-clock changes.
    # (The previous `timedelta.seconds` only held the 0-86399 seconds
    # component and wrapped every day, so a max_wait >= 24h never triggered.)
    start_time = time.monotonic()

    # Print status:
    waiting_for = len(job_ids)
    print(f'Waiting for {waiting_for} jobs to complete...')

    while True:
        # The status checker returns a mapping of job ID -> status string;
        # anything other than 'not completed' counts as finished.
        statuses = check_job_status(job_ids)
        not_finished_count = sum(
            1 for status in statuses.values() if status == 'not completed'
        )
        if not_finished_count == 0:
            break  # All jobs are completed, we're done!

        # Print status, only when it changes:
        if not_finished_count != waiting_for:
            waiting_for = not_finished_count
            print(f'Waiting for {waiting_for} jobs to complete...')

        # Check if we've waited too long:
        if time.monotonic() - start_time > max_wait:
            print('Max wait time exceeded. Exiting.')
            break

        # Wait for a bit before checking again:
        time.sleep(poll_interval)
    return
|