diff --git a/tests/conftest.py b/tests/conftest.py
index a30ff3e7b..1cb660847 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,10 +18,11 @@ from __future__ import annotations
 
 import os
 from pathlib import Path
-from typing import TYPE_CHECKING, Generator, Iterator
+from typing import TYPE_CHECKING, Callable, Generator, Iterator
 
 import pytest
 import xdist
+from _pytest.reports import TestReport
 
 from trezorlib import debuglink, log, models
 from trezorlib.debuglink import TrezorClientDebugLink as Client
@@ -446,6 +447,20 @@ def pytest_runtest_makereport(item: pytest.Item, call) -> Generator:
     setattr(item, f"rep_{rep.when}", rep)
 
 
+@pytest.hookimpl(tryfirst=True)
+def pytest_report_teststatus(
+    report: TestReport, config: Config
+) -> tuple[str, str, tuple[str, dict[str, bool]]] | None:
+    """Give passed reports tagged ui_failed / ui_missing their own status."""
+    if report.passed:
+        for prop, _ in report.user_properties:
+            if prop == "ui_failed":
+                return "ui_failed", "U", ("UI-FAILED", {"red": True})
+            if prop == "ui_missing":
+                return "ui_missing", "M", ("UI-MISSING", {"yellow": True})
+    return None  # otherwise use default handling
+
+
 @pytest.fixture
 def device_handler(client: Client, request: pytest.FixtureRequest) -> Generator:
     device_handler = BackgroundDeviceHandler(client)
diff --git a/tests/ui_tests/__init__.py b/tests/ui_tests/__init__.py
index f4a6b8ff8..2d5ccbdd9 100644
--- a/tests/ui_tests/__init__.py
+++ b/tests/ui_tests/__init__.py
@@ -6,6 +6,7 @@ from typing import Callable, Generator
 
 import pytest
 from _pytest.outcomes import Failed
+from _pytest.nodes import Node
 
 from trezorlib.debuglink import TrezorClientDebugLink as Client
 
@@ -23,23 +24,13 @@ def _process_recorded(result: TestResult) -> None:
     testreport.recorded(result)
 
 
-def _process_tested(result: TestResult) -> None:
+def _process_tested(result: TestResult, item: Node) -> None:
     if result.expected_hash is None:
-        file_path = testreport.missing(result)
-        pytest.fail(
-            f"Hash of {result.test.id} not found in fixtures.json\n"
-            f"Expected:  {result.expected_hash}\n"
-            f"Actual:    {result.actual_hash}\n"
-            f"Diff file: {file_path}"
-        )
+        testreport.missing(result)  # no expected hash for this test
+        item.user_properties.append(("ui_missing", None))  # tag item, test not failed
     elif result.actual_hash != result.expected_hash:
-        file_path = testreport.failed(result)
-        pytest.fail(
-            f"Hash of {result.test.id} differs\n"
-            f"Expected:  {result.expected_hash}\n"
-            f"Actual:    {result.actual_hash}\n"
-            f"Diff file: {file_path}"
-        )
+        testreport.failed(result)  # actual hash differs from expected
+        item.user_properties.append(("ui_failed", None))  # tag item, test not failed
     else:
         testreport.passed(result)
 
@@ -83,7 +74,7 @@ def screen_recording(
     if test_ui == "record":
         _process_recorded(result)
     else:
-        _process_tested(result)
+        _process_tested(result, request.node)
 
 
 def setup(main_runner: bool) -> None:
@@ -156,6 +147,9 @@ def terminal_summary(
 
     if normal_exit:
         println("-------- UI tests summary: --------")
+        for result in TestResult.recent_results():
+            if result.passed and not result.ui_passed:
+                println(f"UI_FAILED: {result.test.id} ({result.actual_hash})")
         println("Run ./tests/show_results.py to open test summary")
         println("")
 
@@ -176,15 +170,16 @@ def sessionfinish(
 
     testreport.generate_reports(record_text_layout, do_master_diff)
 
+    recents = list(TestResult.recent_results())
+
     if test_ui == "test":
-        common.write_fixtures_only_new_results(
-            TestResult.recent_results(),
-            dest=FIXTURES_RESULTS_FILE,
-        )
+        common.write_fixtures_only_new_results(recents, dest=FIXTURES_RESULTS_FILE)
+        if any(t.passed and not t.ui_passed for t in recents):
+            return pytest.ExitCode.TESTS_FAILED
 
     if test_ui == "test" and check_missing and list_missing():
         common.write_fixtures_complete(
-            TestResult.recent_results(),
+            recents,
             remove_missing=True,
             dest=FIXTURES_SUGGESTION_FILE,
         )
diff --git a/tests/ui_tests/common.py b/tests/ui_tests/common.py
index 9b9b7834a..4989ab149 100644
--- a/tests/ui_tests/common.py
+++ b/tests/ui_tests/common.py
@@ -322,7 +322,8 @@ class TestResult:
             json.dumps(metadata, indent=2, sort_keys=True) + "\n"
         )
 
-    def succeeded_in_ui_comparison(self) -> bool:
+    @property
+    def ui_passed(self) -> bool:  # True when actual hash equals the expected hash
         return self.actual_hash == self.expected_hash
 
     @classmethod
@@ -354,7 +355,7 @@ class TestResult:
     def recent_ui_failures(cls) -> t.Iterator[Self]:
         """Returning just the results that resulted in UI failure."""
         for result in cls.recent_results():
-            if not result.succeeded_in_ui_comparison():
+            if not result.ui_passed:
                 yield result
 
     def store_recorded(self) -> None: