summaryrefslogtreecommitdiff
path: root/.gitlab-ci
diff options
context:
space:
mode:
author Guilherme Gallo <guilherme.gallo@collabora.com> 2021-09-15 11:03:16 -0300
committer Marge Bot <eric+marge@anholt.net> 2021-09-15 15:12:52 +0000
commit 7244aa19806cec5265e1e219cac1a99b0d3c62c6 (patch)
tree c45d28a43098bfc037cbc911cbab54a96219f89d /.gitlab-ci
parent 465519679654636e2e585c573226f00f1f631d3f (diff)
download mesa-7244aa19806cec5265e1e219cac1a99b0d3c62c6.tar.gz
gitlab-ci: refactor timeout constants and tweak timeout values
* Refactor the timeout and retry-attempt constants into variables at the top of the Python script. * Increase the LAVA job timeout value from 1 minute to 5 minutes, since the timeout detection is just a heuristic based on log silence from the LAVA devices. If we kept the 1 minute timeout, we could cancel jobs whose tasks legitimately take a long time to respond. A one minute timeout is also prone to false detections when network errors or slowness occur. * Increase the polling rate used to check whether the job has started from 1 check every 30 seconds to 1 check every 10 seconds, since it was taking up to 30 seconds in the worst case to start getting the log output from a LAVA job. It is important to note that some LAVA jobs take less than 2 minutes to finish, so a 10 second wait is more suitable in those cases. Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com> Reviewed-by: Tomeu Vizoso <tomeu.vizoso@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12870>
Diffstat (limited to '.gitlab-ci')
-rwxr-xr-x.gitlab-ci/lava/lava_job_submitter.py24
1 files changed, 19 insertions, 5 deletions
diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py
index e5ff2d8917e..a8f2ace8ded 100755
--- a/.gitlab-ci/lava/lava_job_submitter.py
+++ b/.gitlab-ci/lava/lava_job_submitter.py
@@ -37,6 +37,20 @@ import yaml
from datetime import datetime, timedelta
from lavacli.utils import loader
+# Timeout in minutes to decide if the device from the dispatched LAVA job has
+# hung or not due to the lack of new log output.
+DEVICE_HANGING_TIMEOUT_MIN = 5
+
+# How many seconds the script should wait before trying a new polling iteration
+# to check if the dispatched LAVA job is running or waiting in the job queue.
+WAIT_FOR_DEVICE_POLLING_TIME_SEC = 10
+
+# How many seconds to wait between log output LAVA RPC calls.
+LOG_POLLING_TIME_SEC = 5
+
+# How many retries should be made when a timeout happens.
+NUMBER_OF_RETRIES_TIMEOUT_DETECTION = 2
+
def print_log(msg):
print("{}: {}".format(datetime.now(), msg))
@@ -112,7 +126,7 @@ def generate_lava_yaml(args):
'format': 'Lava-Test Test Definition 1.0',
},
'parse': {
- 'pattern': 'hwci: (?P<test_case_id>\S*):\s+(?P<result>(pass|fail))'
+ 'pattern': r'hwci: (?P<test_case_id>\S*):\s+(?P<result>(pass|fail))'
},
'run': {
},
@@ -218,7 +232,7 @@ def wait_until_job_is_started(proxy, job_id):
job_state = _call_proxy(proxy.scheduler.job_state, job_id)
current_state = job_state["job_state"]
- time.sleep(30)
+ time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
print_log(f"Job {job_id} started.")
def follow_job_execution(proxy, job_id):
@@ -237,7 +251,7 @@ def follow_job_execution(proxy, job_id):
line_count += len(logs)
else:
- time_limit = timedelta(minutes=1)
+ time_limit = timedelta(minutes=DEVICE_HANGING_TIMEOUT_MIN)
if datetime.now() - last_time_logs > time_limit:
print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
return False
@@ -245,7 +259,7 @@ def follow_job_execution(proxy, job_id):
# `proxy.scheduler.jobs.logs` does not block, even when there is no
# new log to be fetched. To avoid DoS-ing the LAVA dispatcher
# machine, let's add a sleep to save them some stamina.
- time.sleep(5)
+ time.sleep(LOG_POLLING_TIME_SEC)
return True
@@ -282,7 +296,7 @@ def main(args):
print("LAVA job definition validated successfully")
return
- retry_count = 2
+ retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
while retry_count >= 0:
job_id = submit_job(proxy, yaml_file)