diff options
author | Guilherme Gallo <guilherme.gallo@collabora.com> | 2021-09-15 11:03:16 -0300 |
---|---|---|
committer | Marge Bot <eric+marge@anholt.net> | 2021-09-15 15:12:52 +0000 |
commit | 7244aa19806cec5265e1e219cac1a99b0d3c62c6 (patch) | |
tree | c45d28a43098bfc037cbc911cbab54a96219f89d /.gitlab-ci | |
parent | 465519679654636e2e585c573226f00f1f631d3f (diff) | |
download | mesa-7244aa19806cec5265e1e219cac1a99b0d3c62c6.tar.gz |
gitlab-ci: refactor timeout constants and tweak timeout values
* Refactor the timeout and retry-attempt constants into variables at the top
of the Python script.
* Increase the LAVA job timeout value from 1 minute to 5 minutes, since the
timeout detection is just a heuristic based on log silence from LAVA
devices. If we kept the 1-minute timeout, we could end up cancelling jobs
whose tasks legitimately take a long time to respond. A 1-minute
timeout is also prone to false detections when network errors or
slowness occur.
* Increase the polling rate used to check whether the job has started from 1
check every 30 seconds to 1 check every 10 seconds, since in the worst case
it was taking 30 seconds to start getting the log output from a LAVA
job. It is important to note that some LAVA jobs take less than 2
minutes to finish, so a 10-second wait is more suitable in those
cases.
Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Reviewed-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12870>
Diffstat (limited to '.gitlab-ci')
-rwxr-xr-x | .gitlab-ci/lava/lava_job_submitter.py | 24 |
1 files changed, 19 insertions, 5 deletions
diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py index e5ff2d8917e..a8f2ace8ded 100755 --- a/.gitlab-ci/lava/lava_job_submitter.py +++ b/.gitlab-ci/lava/lava_job_submitter.py @@ -37,6 +37,20 @@ import yaml from datetime import datetime, timedelta from lavacli.utils import loader +# Timeout in minutes to decide if the device from the dispatched LAVA job has +# hung or not due to the lack of new log output. +DEVICE_HANGING_TIMEOUT_MIN = 5 + +# How many seconds the script should wait before try a new polling iteration to +# check if the dispatched LAVA job is running or waiting in the job queue. +WAIT_FOR_DEVICE_POLLING_TIME_SEC = 10 + +# How many seconds to wait between log output LAVA RPC calls. +LOG_POLLING_TIME_SEC = 5 + +# How many retries should be made when a timeout happen. +NUMBER_OF_RETRIES_TIMEOUT_DETECTION = 2 + def print_log(msg): print("{}: {}".format(datetime.now(), msg)) @@ -112,7 +126,7 @@ def generate_lava_yaml(args): 'format': 'Lava-Test Test Definition 1.0', }, 'parse': { - 'pattern': 'hwci: (?P<test_case_id>\S*):\s+(?P<result>(pass|fail))' + 'pattern': r'hwci: (?P<test_case_id>\S*):\s+(?P<result>(pass|fail))' }, 'run': { }, @@ -218,7 +232,7 @@ def wait_until_job_is_started(proxy, job_id): job_state = _call_proxy(proxy.scheduler.job_state, job_id) current_state = job_state["job_state"] - time.sleep(30) + time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC) print_log(f"Job {job_id} started.") def follow_job_execution(proxy, job_id): @@ -237,7 +251,7 @@ def follow_job_execution(proxy, job_id): line_count += len(logs) else: - time_limit = timedelta(minutes=1) + time_limit = timedelta(minutes=DEVICE_HANGING_TIMEOUT_MIN) if datetime.now() - last_time_logs > time_limit: print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id)) return False @@ -245,7 +259,7 @@ def follow_job_execution(proxy, job_id): # `proxy.scheduler.jobs.logs` does not block, even when there is no # new log to be fetched. 
To avoid dosing the LAVA dispatcher # machine, let's add a sleep to save them some stamina. - time.sleep(5) + time.sleep(LOG_POLLING_TIME_SEC) return True @@ -282,7 +296,7 @@ def main(args): print("LAVA job definition validated successfully") return - retry_count = 2 + retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION while retry_count >= 0: job_id = submit_job(proxy, yaml_file) |