diff options
author | Guilherme Gallo <guilherme.gallo@collabora.com> | 2023-04-06 01:31:04 -0300 |
---|---|---|
committer | Marge Bot <emma+marge@anholt.net> | 2023-04-19 14:36:37 +0000 |
commit | 0ac3824922283d2604e87773dc2579b2b73f3a25 (patch) | |
tree | ea31405c3330fe9c8b58958157438e3ffd76b595 /.gitlab-ci | |
parent | 41f29c5333372a9aa5e0d6e05476ed529326afa5 (diff) | |
download | mesa-0ac3824922283d2604e87773dc2579b2b73f3a25.tar.gz |
ci/lava: Add a simple Structural Logger into submitter
Refactor some pieces of the submitter to improve the clarity of the
functions and create a simple dictionary with aggregated data from the
submitter execution which will be dumped to a file when the script
exits.
Add support for the AutoSaveDict based structured logger as well, which
will come in a follow-up commit.
Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22500>
Diffstat (limited to '.gitlab-ci')
-rwxr-xr-x | .gitlab-ci/lava/lava-submit.sh | 1 | ||||
-rwxr-xr-x | .gitlab-ci/lava/lava_job_submitter.py | 70 | ||||
-rw-r--r-- | .gitlab-ci/tests/conftest.py | 17 | ||||
-rw-r--r-- | .gitlab-ci/tests/test_lava_job_submitter.py | 7 |
4 files changed, 80 insertions, 15 deletions
diff --git a/.gitlab-ci/lava/lava-submit.sh b/.gitlab-ci/lava/lava-submit.sh index 5e1ec0c84ac..5f5870585aa 100755 --- a/.gitlab-ci/lava/lava-submit.sh +++ b/.gitlab-ci/lava/lava-submit.sh @@ -64,4 +64,5 @@ PYTHONPATH=artifacts/ artifacts/lava/lava_job_submitter.py \ --visibility-group "${VISIBILITY_GROUP}" \ --lava-tags "${LAVA_TAGS}" \ --mesa-job-name "$CI_JOB_NAME" \ + --structured-log-file "results/lava_job_detail.json" \ >> results/lava.log diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py index 0372fb40d55..0ba515a1bf5 100755 --- a/.gitlab-ci/lava/lava_job_submitter.py +++ b/.gitlab-ci/lava/lava_job_submitter.py @@ -11,9 +11,11 @@ import contextlib +import json import pathlib import sys import time +from collections import defaultdict from dataclasses import dataclass, fields from datetime import datetime, timedelta from io import StringIO @@ -43,6 +45,10 @@ from lava.utils import ( ) from lavacli.utils import flow_yaml as lava_yaml +# Initialize structural logging with a defaultdict, it can be changed for more +# sophisticated dict-like data abstractions. +STRUCTURAL_LOG = defaultdict(list) + # Timeout in seconds to decide if the device from the dispatched LAVA job has # hung or not due to the lack of new log output. DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC", 5*60)) @@ -68,7 +74,11 @@ NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int( getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2) ) -def find_exception_from_metadata(metadata, job_id): +def raise_exception_from_metadata(metadata: dict, job_id: int) -> None: + """ + Investigate infrastructure errors from the job metadata. + If it finds an error, raise it as MesaCIException. + """ if "result" not in metadata or metadata["result"] != "fail": return if "error_type" in metadata: @@ -90,23 +100,22 @@ def find_exception_from_metadata(metadata, job_id): raise MesaCIException( f"LAVA job {job_id} failed validation (possible download error). Retry." ) - return metadata -def find_lava_error(job) -> None: - # Look for infrastructure errors and retry if we see them. +def raise_lava_error(job) -> None: + # Look for infrastructure errors, raise them, and retry if we see them. results_yaml = call_proxy(job.proxy.results.get_testjob_results_yaml, job.job_id) results = lava_yaml.load(results_yaml) for res in results: metadata = res["metadata"] - find_exception_from_metadata(metadata, job.job_id) + raise_exception_from_metadata(metadata, job.job_id) # If we reach this far, it means that the job ended without hwci script # result and no LAVA infrastructure problem was found job.status = "fail" -def show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"): +def show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{CONSOLE_LOG['FG_GREEN']}"): with GitlabSection( "job_data", "LAVA job info", @@ -233,7 +242,8 @@ def follow_job_execution(job, log_follower): # If this does not happen, it probably means a LAVA infrastructure error # happened. if job.status not in ["pass", "fail"]: - find_lava_error(job) + raise_lava_error(job) + def structural_log_phases(job, log_follower): phases: dict[str, Any] = { @@ -244,6 +254,7 @@ def structural_log_phases(job, log_follower): } job.log["dut_job_phases"] = phases + def print_job_final_status(job): if job.status == "running": job.status = "hung" @@ -256,13 +267,19 @@ def print_job_final_status(job): ) job.refresh_log() - job.log["status"] = job.status - show_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}") + show_final_job_data(job, colour=f"{CONSOLE_LOG['BOLD']}{color}") -def execute_job_with_retries(proxy, job_definition, retry_count) -> Optional[LAVAJob]: +def execute_job_with_retries( + proxy, job_definition, retry_count, jobs_log +) -> Optional[LAVAJob]: for attempt_no in range(1, retry_count + 2): - job = LAVAJob(proxy, job_definition) + # Need to get the logger value from its object to enable autosave + # features + jobs_log.append({}) + job_log = jobs_log[-1] + job = LAVAJob(proxy, job_definition, job_log) + STRUCTURAL_LOG["dut_attempt_counter"] = attempt_no try: submit_job(job) @@ -280,6 +297,7 @@ def execute_job_with_retries(proxy, job_definition, retry_count) -> Optional[LAV ) finally: + job_log["finished_time"] = datetime.now().isoformat() print_job_final_status(job) @@ -287,7 +305,7 @@ def retriable_follow_job(proxy, job_definition) -> LAVAJob: number_of_retries = NUMBER_OF_RETRIES_TIMEOUT_DETECTION if finished_job := execute_job_with_retries( - proxy, job_definition, number_of_retries + proxy, job_definition, number_of_retries, STRUCTURAL_LOG["dut_jobs"] ): return finished_job @@ -335,12 +353,16 @@ class LAVAJobSubmitter(PathResolver): validate_only: bool = False # Whether to only validate the job, not execute it visibility_group: str = None # Only affects LAVA farm maintainers job_rootfs_overlay_url: str = None + structured_log_file: pathlib.Path = None # Log file path with structured LAVA log def __post_init__(self) -> None: super().__post_init__() # Remove mesa job names with spaces, which breaks the lava-test-case command self.mesa_job_name = self.mesa_job_name.split(" ")[0] + if self.structured_log_file: + self.setup_structured_logger() + def dump(self, job_definition): if self.dump_yaml: with GitlabSection( @@ -377,10 +399,32 @@ class LAVAJobSubmitter(PathResolver): if self.validate_only: return - finished_job = retriable_follow_job(proxy, job_definition) + try: + finished_job = retriable_follow_job(proxy, job_definition) + except Exception as exception: + STRUCTURAL_LOG["job_combined_fail_reason"] = str(exception) + raise exception exit_code = 0 if finished_job.status == "pass" else 1 + STRUCTURAL_LOG["job_combined_status"] = job.status sys.exit(exit_code) + def setup_structured_logger(self): + try: + global STRUCTURAL_LOG + STRUCTURAL_LOG = StructuredLogger( + self.structured_log_file, truncate=True + ).data + except NameError as e: + print( + f"Could not import StructuredLogger library: {e}. " + "Falling back to DummyLogger" + ) + + STRUCTURAL_LOG["fixed_tags"] = self.lava_tags + STRUCTURAL_LOG["dut_job_type"] = self.device_type + STRUCTURAL_LOG["job_combined_fail_reason"] = None + STRUCTURAL_LOG["job_combined_status"] = "not_submitted" + if __name__ == "__main__": # given that we proxy from DUT -> LAVA dispatcher -> LAVA primary -> us -> diff --git a/.gitlab-ci/tests/conftest.py b/.gitlab-ci/tests/conftest.py index be71c945301..ba021cc4719 100644 --- a/.gitlab-ci/tests/conftest.py +++ b/.gitlab-ci/tests/conftest.py @@ -1,3 +1,4 @@ +from collections import defaultdict from unittest.mock import MagicMock, patch import pytest @@ -32,7 +33,7 @@ RESULT_GET_TESTJOB_RESULTS = [{"metadata": {"result": "test"}}] @pytest.fixture -def mock_proxy(): +def mock_proxy(frozen_time): def create_proxy_mock( job_results=RESULT_GET_TESTJOB_RESULTS, testsuite_results=[generate_testsuite_result()], @@ -51,6 +52,20 @@ def mock_proxy(): proxy_logs_mock = proxy_mock.scheduler.jobs.logs proxy_logs_mock.return_value = jobs_logs_response() + proxy_job_state = proxy_mock.scheduler.job_state + proxy_job_state.return_value = {"job_state": "Running"} + proxy_job_state.side_effect = frozen_time.tick(1) + + proxy_show_mock = proxy_mock.scheduler.jobs.show + proxy_show_mock.return_value = defaultdict( + str, + { + "device_type": "test_device", + "device": "test_device-cbg-1", + "state": "created", + }, + ) + for key, value in kwargs.items(): setattr(proxy_logs_mock, key, value) diff --git a/.gitlab-ci/tests/test_lava_job_submitter.py b/.gitlab-ci/tests/test_lava_job_submitter.py index d481aac4277..7ef2255550c 100644 --- a/.gitlab-ci/tests/test_lava_job_submitter.py +++ b/.gitlab-ci/tests/test_lava_job_submitter.py @@ -9,6 +9,7 @@ import xmlrpc.client from contextlib import nullcontext as does_not_raise from datetime import datetime from itertools import chain, repeat +from pathlib import Path import pytest from lava.exceptions import MesaCIException, MesaCIRetryError @@ -16,6 +17,7 @@ from lava.lava_job_submitter import ( DEVICE_HANGING_TIMEOUT_SEC, NUMBER_OF_RETRIES_TIMEOUT_DETECTION, LAVAJob, + LAVAJobSubmitter, bootstrap_log_follower, follow_job_execution, retriable_follow_job, @@ -299,7 +301,7 @@ def test_parse_job_result_from_log(message, expectation, mock_proxy): @pytest.mark.slow( reason="Slow and sketchy test. Needs a LAVA log raw file at /tmp/log.yaml" ) -def test_full_yaml_log(mock_proxy, frozen_time): +def test_full_yaml_log(mock_proxy, frozen_time, tmp_path): import random from lavacli.utils import flow_yaml as lava_yaml @@ -350,7 +352,10 @@ def test_full_yaml_log(mock_proxy, frozen_time): def reset_logs(*args): proxy.scheduler.jobs.logs.side_effect = load_lines() + tmp_file = Path(tmp_path) / "log.json" + LAVAJobSubmitter(structured_log_file=tmp_file) proxy.scheduler.jobs.submit = reset_logs with pytest.raises(MesaCIRetryError): time_travel_to_test_time() retriable_follow_job(proxy, "") + print(tmp_file.read_text()) |