summaryrefslogtreecommitdiff
path: root/.gitlab-ci
diff options
context:
space:
mode:
authorGuilherme Gallo <guilherme.gallo@collabora.com>2023-04-06 01:31:04 -0300
committerMarge Bot <emma+marge@anholt.net>2023-04-19 14:36:37 +0000
commit0ac3824922283d2604e87773dc2579b2b73f3a25 (patch)
treeea31405c3330fe9c8b58958157438e3ffd76b595 /.gitlab-ci
parent41f29c5333372a9aa5e0d6e05476ed529326afa5 (diff)
downloadmesa-0ac3824922283d2604e87773dc2579b2b73f3a25.tar.gz
ci/lava: Add a simple Structural Logger into submitter
Refactor some pieces of the submitter to improve the clarity of the functions and create a simple dictionary with aggregated data from the submitter execution which will be dumped to a file when the script exits. Add support for the AutoSaveDict based structured logger as well, which will come in a follow-up commit. Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22500>
Diffstat (limited to '.gitlab-ci')
-rwxr-xr-x.gitlab-ci/lava/lava-submit.sh1
-rwxr-xr-x.gitlab-ci/lava/lava_job_submitter.py70
-rw-r--r--.gitlab-ci/tests/conftest.py17
-rw-r--r--.gitlab-ci/tests/test_lava_job_submitter.py7
4 files changed, 80 insertions, 15 deletions
diff --git a/.gitlab-ci/lava/lava-submit.sh b/.gitlab-ci/lava/lava-submit.sh
index 5e1ec0c84ac..5f5870585aa 100755
--- a/.gitlab-ci/lava/lava-submit.sh
+++ b/.gitlab-ci/lava/lava-submit.sh
@@ -64,4 +64,5 @@ PYTHONPATH=artifacts/ artifacts/lava/lava_job_submitter.py \
--visibility-group "${VISIBILITY_GROUP}" \
--lava-tags "${LAVA_TAGS}" \
--mesa-job-name "$CI_JOB_NAME" \
+ --structured-log-file "results/lava_job_detail.json" \
>> results/lava.log
diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py
index 0372fb40d55..0ba515a1bf5 100755
--- a/.gitlab-ci/lava/lava_job_submitter.py
+++ b/.gitlab-ci/lava/lava_job_submitter.py
@@ -11,9 +11,11 @@
import contextlib
+import json
import pathlib
import sys
import time
+from collections import defaultdict
from dataclasses import dataclass, fields
from datetime import datetime, timedelta
from io import StringIO
@@ -43,6 +45,10 @@ from lava.utils import (
)
from lavacli.utils import flow_yaml as lava_yaml
+# Initialize structural logging with a defaultdict, it can be changed for more
+# sophisticated dict-like data abstractions.
+STRUCTURAL_LOG = defaultdict(list)
+
# Timeout in seconds to decide if the device from the dispatched LAVA job has
# hung or not due to the lack of new log output.
DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC", 5*60))
@@ -68,7 +74,11 @@ NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(
getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2)
)
-def find_exception_from_metadata(metadata, job_id):
+def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
+ """
+ Investigate infrastructure errors from the job metadata.
+ If it finds an error, raise it as MesaCIException.
+ """
if "result" not in metadata or metadata["result"] != "fail":
return
if "error_type" in metadata:
@@ -90,23 +100,22 @@ def find_exception_from_metadata(metadata, job_id):
raise MesaCIException(
f"LAVA job {job_id} failed validation (possible download error). Retry."
)
- return metadata
-def find_lava_error(job) -> None:
- # Look for infrastructure errors and retry if we see them.
+def raise_lava_error(job) -> None:
+ # Look for infrastructure errors, raise them, and retry if we see them.
results_yaml = call_proxy(job.proxy.results.get_testjob_results_yaml, job.job_id)
results = lava_yaml.load(results_yaml)
for res in results:
metadata = res["metadata"]
- find_exception_from_metadata(metadata, job.job_id)
+ raise_exception_from_metadata(metadata, job.job_id)
# If we reach this far, it means that the job ended without hwci script
# result and no LAVA infrastructure problem was found
job.status = "fail"
-def show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"):
+def show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"):
with GitlabSection(
"job_data",
"LAVA job info",
@@ -233,7 +242,8 @@ def follow_job_execution(job, log_follower):
# If this does not happen, it probably means a LAVA infrastructure error
# happened.
if job.status not in ["pass", "fail"]:
- find_lava_error(job)
+ raise_lava_error(job)
+
def structural_log_phases(job, log_follower):
phases: dict[str, Any] = {
@@ -244,6 +254,7 @@ def structural_log_phases(job, log_follower):
}
job.log["dut_job_phases"] = phases
+
def print_job_final_status(job):
if job.status == "running":
job.status = "hung"
@@ -256,13 +267,19 @@ def print_job_final_status(job):
)
job.refresh_log()
- job.log["status"] = job.status
- show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}")
+ show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}")
-def execute_job_with_retries(proxy, job_definition, retry_count) -> Optional[LAVAJob]:
+def execute_job_with_retries(
+ proxy, job_definition, retry_count, jobs_log
+) -> Optional[LAVAJob]:
for attempt_no in range(1, retry_count + 2):
- job = LAVAJob(proxy, job_definition)
+ # Need to get the logger value from its object to enable autosave
+ # features
+ jobs_log.append({})
+ job_log = jobs_log[-1]
+ job = LAVAJob(proxy, job_definition, job_log)
+ STRUCTURAL_LOG["dut_attempt_counter"] = attempt_no
try:
submit_job(job)
@@ -280,6 +297,7 @@ def execute_job_with_retries(proxy, job_definition, retry_count) -> Optional[LAV
)
finally:
+ job_log["finished_time"] = datetime.now().isoformat()
print_job_final_status(job)
@@ -287,7 +305,7 @@ def retriable_follow_job(proxy, job_definition) -> LAVAJob:
number_of_retries = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
if finished_job := execute_job_with_retries(
- proxy, job_definition, number_of_retries
+ proxy, job_definition, number_of_retries, STRUCTURAL_LOG["dut_jobs"]
):
return finished_job
@@ -335,12 +353,16 @@ class LAVAJobSubmitter(PathResolver):
validate_only: bool = False # Whether to only validate the job, not execute it
visibility_group: str = None # Only affects LAVA farm maintainers
job_rootfs_overlay_url: str = None
+ structured_log_file: pathlib.Path = None # Log file path with structured LAVA log
def __post_init__(self) -> None:
super().__post_init__()
# Remove mesa job names with spaces, which breaks the lava-test-case command
self.mesa_job_name = self.mesa_job_name.split(" ")[0]
+ if self.structured_log_file:
+ self.setup_structured_logger()
+
def dump(self, job_definition):
if self.dump_yaml:
with GitlabSection(
@@ -377,10 +399,32 @@ class LAVAJobSubmitter(PathResolver):
if self.validate_only:
return
- finished_job = retriable_follow_job(proxy, job_definition)
+ try:
+ finished_job = retriable_follow_job(proxy, job_definition)
+ except Exception as exception:
+ STRUCTURAL_LOG["job_combined_fail_reason"] = str(exception)
+ raise exception
exit_code = 0 if finished_job.status == "pass" else 1
+ STRUCTURAL_LOG["job_combined_status"] = job.status
sys.exit(exit_code)
+ def setup_structured_logger(self):
+ try:
+ global STRUCTURAL_LOG
+ STRUCTURAL_LOG = StructuredLogger(
+ self.structured_log_file, truncate=True
+ ).data
+ except NameError as e:
+ print(
+ f"Could not import StructuredLogger library: {e}. "
+ "Falling back to DummyLogger"
+ )
+
+ STRUCTURAL_LOG["fixed_tags"] = self.lava_tags
+ STRUCTURAL_LOG["dut_job_type"] = self.device_type
+ STRUCTURAL_LOG["job_combined_fail_reason"] = None
+ STRUCTURAL_LOG["job_combined_status"] = "not_submitted"
+
if __name__ == "__main__":
# given that we proxy from DUT -> LAVA dispatcher -> LAVA primary -> us ->
diff --git a/.gitlab-ci/tests/conftest.py b/.gitlab-ci/tests/conftest.py
index be71c945301..ba021cc4719 100644
--- a/.gitlab-ci/tests/conftest.py
+++ b/.gitlab-ci/tests/conftest.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
from unittest.mock import MagicMock, patch
import pytest
@@ -32,7 +33,7 @@ RESULT_GET_TESTJOB_RESULTS = [{"metadata": {"result": "test"}}]
@pytest.fixture
-def mock_proxy():
+def mock_proxy(frozen_time):
def create_proxy_mock(
job_results=RESULT_GET_TESTJOB_RESULTS,
testsuite_results=[generate_testsuite_result()],
@@ -51,6 +52,20 @@ def mock_proxy():
proxy_logs_mock = proxy_mock.scheduler.jobs.logs
proxy_logs_mock.return_value = jobs_logs_response()
+ proxy_job_state = proxy_mock.scheduler.job_state
+ proxy_job_state.return_value = {"job_state": "Running"}
+ proxy_job_state.side_effect = frozen_time.tick(1)
+
+ proxy_show_mock = proxy_mock.scheduler.jobs.show
+ proxy_show_mock.return_value = defaultdict(
+ str,
+ {
+ "device_type": "test_device",
+ "device": "test_device-cbg-1",
+ "state": "created",
+ },
+ )
+
for key, value in kwargs.items():
setattr(proxy_logs_mock, key, value)
diff --git a/.gitlab-ci/tests/test_lava_job_submitter.py b/.gitlab-ci/tests/test_lava_job_submitter.py
index d481aac4277..7ef2255550c 100644
--- a/.gitlab-ci/tests/test_lava_job_submitter.py
+++ b/.gitlab-ci/tests/test_lava_job_submitter.py
@@ -9,6 +9,7 @@ import xmlrpc.client
from contextlib import nullcontext as does_not_raise
from datetime import datetime
from itertools import chain, repeat
+from pathlib import Path
import pytest
from lava.exceptions import MesaCIException, MesaCIRetryError
@@ -16,6 +17,7 @@ from lava.lava_job_submitter import (
DEVICE_HANGING_TIMEOUT_SEC,
NUMBER_OF_RETRIES_TIMEOUT_DETECTION,
LAVAJob,
+ LAVAJobSubmitter,
bootstrap_log_follower,
follow_job_execution,
retriable_follow_job,
@@ -299,7 +301,7 @@ def test_parse_job_result_from_log(message, expectation, mock_proxy):
@pytest.mark.slow(
reason="Slow and sketchy test. Needs a LAVA log raw file at /tmp/log.yaml"
)
-def test_full_yaml_log(mock_proxy, frozen_time):
+def test_full_yaml_log(mock_proxy, frozen_time, tmp_path):
import random
from lavacli.utils import flow_yaml as lava_yaml
@@ -350,7 +352,10 @@ def test_full_yaml_log(mock_proxy, frozen_time):
def reset_logs(*args):
proxy.scheduler.jobs.logs.side_effect = load_lines()
+ tmp_file = Path(tmp_path) / "log.json"
+ LAVAJobSubmitter(structured_log_file=tmp_file)
proxy.scheduler.jobs.submit = reset_logs
with pytest.raises(MesaCIRetryError):
time_travel_to_test_time()
retriable_follow_job(proxy, "")
+ print(tmp_file.read_text())