chromium_src/agents/testing/workers_unittest.py · OpenHarmony-TPC/chromium_src - AtomGit

#!/usr/bin/env vpython3
# Copyright 2025 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Tests for workers."""

import json
import pathlib
import queue
import shutil
import subprocess
import time
import unittest
from unittest import mock

from pyfakefs import fake_filesystem_unittest

import promptfoo_installation
import results
import workers
import eval_config

# pylint: disable=protected-access

_POLLING_INTERVAL = 0.001


class WorkDirUnittest(fake_filesystem_unittest.TestCase):
    """Unit tests for the WorkDir class."""

    def setUp(self):
        self.setUpPyfakefs()
        self.fs.create_dir('/tmp/src')
        self._setUpPatches()

    def _setUpPatches(self):
        """Set up patches for the tests."""
        rmtree_patcher = mock.patch('shutil.rmtree')
        self.mock_rmtree = rmtree_patcher.start()
        self.addCleanup(rmtree_patcher.stop)

        call_patcher = mock.patch('subprocess.call')
        self.mock_call = call_patcher.start()
        self.addCleanup(call_patcher.stop)

        check_call_patcher = mock.patch('subprocess.check_call')
        self.mock_check_call = check_call_patcher.start()
        self.addCleanup(check_call_patcher.stop)

        check_btrfs_patcher = mock.patch('checkout_helpers.check_btrfs')
        self.mock_check_btrfs = check_btrfs_patcher.start()
        self.addCleanup(check_btrfs_patcher.stop)

    def test_enter_btrfs(self):
        """Tests that a btrfs snapshot is created when btrfs is true."""
        self.mock_check_btrfs.return_value = True
        workdir = workers.WorkDir('workdir',
                                  pathlib.Path('/tmp/src'),
                                  clean=False,
                                  verbose=False,
                                  force=False)
        with workdir as w:
            self.assertEqual(w, workdir)

        self.mock_check_call.assert_called_once_with(
            [
                'btrfs',
                'subvol',
                'snapshot',
                pathlib.Path('/tmp/src'),
                pathlib.Path('/tmp/workdir'),
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
        )

    def test_enter_no_btrfs(self):
        """Tests that gclient-new-workdir is called when btrfs is false."""
        self.mock_check_btrfs.return_value = False
        workdir = workers.WorkDir('workdir',
                                  pathlib.Path('/tmp/src'),
                                  clean=False,
                                  verbose=False,
                                  force=False)
        with workdir as w:
            self.assertEqual(w, workdir)

        self.mock_check_call.assert_called_once_with(
            [
                'gclient-new-workdir.py',
                pathlib.Path('/tmp/src'),
                pathlib.Path('/tmp/workdir'),
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
        )

    def test_enter_verbose(self):
        """Tests that verbose logging is enabled when verbose is true."""
        self.mock_check_btrfs.return_value = False
        workdir = workers.WorkDir('workdir',
                                  pathlib.Path('/tmp/src'),
                                  clean=False,
                                  verbose=True,
                                  force=False)
        with workdir as w:
            self.assertEqual(w, workdir)

        self.mock_check_call.assert_called_once_with(
            [
                'gclient-new-workdir.py',
                pathlib.Path('/tmp/src'),
                pathlib.Path('/tmp/workdir'),
            ],
            stdout=None,  # Output surfaced instead of going to DEVNULL.
            stderr=subprocess.STDOUT,
        )

    def test_enter_exists(self):
        """Tests that the workdir is removed if it exists."""
        self.fs.create_dir('/tmp/workdir')
        self.mock_check_btrfs.return_value = True
        workdir = workers.WorkDir('workdir',
                                  pathlib.Path('/tmp/src'),
                                  clean=False,
                                  verbose=False,
                                  force=True)
        with workdir:
            pass

        self.mock_call.assert_called_once_with(
            [
                'sudo',
                '-n',
                'btrfs',
                'subvolume',
                'delete',
                pathlib.Path('/tmp/workdir'),
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
        )

    def test_exit_clean_btrfs(self):
        """Tests that the workdir is removed when clean is true w/ btrfs ."""
        self.mock_check_btrfs.return_value = True
        workdir = workers.WorkDir('workdir',
                                  pathlib.Path('/tmp/src'),
                                  clean=True,
                                  verbose=False,
                                  force=False)
        with workdir:
            pass

        self.mock_call.assert_called_once_with(
            [
                'sudo',
                'btrfs',
                'subvolume',
                'delete',
                pathlib.Path('/tmp/workdir'),
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
        )

    def test_exit_clean_no_btrfs(self):
        """Tests that the workdir is removed when clean is True w/o btrfs."""
        self.mock_check_btrfs.return_value = False
        workdir = workers.WorkDir('workdir',
                                  pathlib.Path('/tmp/src'),
                                  clean=True,
                                  verbose=False,
                                  force=False)
        with workdir:
            pass

        self.mock_rmtree.assert_called_once_with(pathlib.Path('/tmp/workdir'))

    def test_exit_no_clean(self):
        """Tests that the workdir is not cleaned up when clean is False."""
        self.mock_check_btrfs.return_value = False
        workdir = workers.WorkDir('workdir',
                                  pathlib.Path('/tmp/src'),
                                  clean=False,
                                  verbose=False,
                                  force=False)
        with workdir:
            pass

        self.mock_call.assert_not_called()
        self.mock_rmtree.assert_not_called()


    def test_exit_clean_btrfs_fallback(self):
        """Tests that shutil is used when btrfs subvolume delete fails."""
        self.mock_check_btrfs.return_value = True
        self.mock_call.return_value = 1
        workdir = workers.WorkDir('workdir',
                                  pathlib.Path('/tmp/src'),
                                  clean=True,
                                  verbose=False,
                                  force=False)
        with workdir:
            pass

        self.mock_call.assert_called_once_with(
            [
                'sudo',
                'btrfs',
                'subvolume',
                'delete',
                pathlib.Path('/tmp/workdir'),
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
        )
        self.mock_rmtree.assert_called_once_with(pathlib.Path('/tmp/workdir'))


class ExtractMetricsUnittest(fake_filesystem_unittest.TestCase):
    """Unit tests for the _extract_metrics_from_promptfoo_results."""

    def setUp(self):
        self.setUpPyfakefs()

    def test_success(self):
        """Tests a successful extraction."""
        results_data = {
            'results': {
                'results': [
                    {
                        'score': 0.5,
                        'response': {
                            'metrics': {
                                'gemini_cli_token_usage': {
                                    'total_tokens': 10,
                                },
                            },
                        },
                    },
                ],
            },
        }
        metrics = workers._extract_metrics_from_promptfoo_results(results_data)
        self.assertEqual(metrics, {
            'token_usage': {
                'total_tokens': 10
            },
            'score': 0.5
        })

    def test_no_score(self):
        """Tests when the score is missing."""
        results_data = {
            'results': {
                'results': [
                    {
                        'response': {
                            'metrics': {
                                'gemini_cli_token_usage': {
                                    'total_tokens': 10,
                                },
                            },
                        },
                    },
                ],
            },
        }
        metrics = workers._extract_metrics_from_promptfoo_results(results_data)
        self.assertEqual(metrics, {'token_usage': {'total_tokens': 10}})

    def test_no_token_usage(self):
        """Tests when token usage is missing."""
        results_data = {
            'results': {
                'results': [
                    {
                        'score': 0.5,
                        'response': {
                            'metrics': {},
                        },
                    },
                ],
            },
        }
        metrics = workers._extract_metrics_from_promptfoo_results(results_data)
        self.assertEqual(metrics, {'token_usage': {}, 'score': 0.5})

    def test_empty_results(self):
        """Tests when the results file is empty."""
        metrics = workers._extract_metrics_from_promptfoo_results({})
        self.assertEqual(metrics, {})


class ParseTestLogResultsTest(unittest.TestCase):

    def test_empty_json(self):
        self.assertEqual(workers._parse_test_log_results(None), '')
        self.assertEqual(workers._parse_test_log_results({}),
                         'No results found in promptfoo output.')

    def test_empty_results_list(self):
        json_data = {'results': {'results': []}}
        self.assertEqual(workers._parse_test_log_results(json_data),
                         'No results found in promptfoo output.')

    def test_missing_keys(self):
        json_data = {'results': {'results': [{}]}}
        expected = ('Input prompt: None\n'
                    'Response: None\n'
                    'Assertion results:\n')
        self.assertEqual(workers._parse_test_log_results(json_data), expected)

        json_data = {'results': {'results': [{'gradingResult': {}}]}}
        self.assertEqual(workers._parse_test_log_results(json_data), expected)

        json_data = {
            'results': {
                'results': [{
                    'gradingResult': {
                        'componentResults': []
                    }
                }]
            }
        }
        self.assertEqual(workers._parse_test_log_results(json_data), expected)

    def test_full_json(self):
        json_data = {
            "results": {
                "results": [{
                    "gradingResult": {
                        "componentResults": [{
                            "pass": True,
                            "reason": "Looks good",
                            "score": 1.0
                        }, {
                            "pass": False,
                            "reason": "Not so good",
                            "score": 0.0
                        }]
                    },
                    "response": {
                        "metrics": {
                            "user_prompt": "This is the prompt.",
                            "full_output": "This is the output."
                        }
                    }
                }]
            }
        }
        expected_output = ("Input prompt: This is the prompt.\n"
                           "Response: This is the output.\n"
                           "Assertion results:\n"
                           "pass: True\nreason: Looks good\nscore: 1.0\n\n"
                           "pass: False\nreason: Not so good\nscore: 0.0\n\n")
        self.assertEqual(workers._parse_test_log_results(json_data),
                         expected_output)

    def test_no_component_results(self):
        json_data = {
            "results": {
                "results": [{
                    "gradingResult": {
                        "componentResults": []
                    },
                    "response": {
                        "metrics": {
                            "user_prompt": "This is the prompt.",
                            "full_output": "This is the output."
                        }
                    }
                }]
            }
        }
        expected_output = ("Input prompt: This is the prompt.\n"
                           "Response: This is the output.\n"
                           "Assertion results:\n")
        self.assertEqual(workers._parse_test_log_results(json_data),
                         expected_output)


class LoadPromptfooResultsUnittest(fake_filesystem_unittest.TestCase):
    """Unit tests for the _load_promptfoo_results."""

    def setUp(self):
        self.setUpPyfakefs()

    def test_success(self):
        """Tests a successful load."""
        results_data = {
            'results': {
                'results': [],
            },
        }
        results_content = json.dumps(results_data)
        results_file = pathlib.Path('/results.json')
        self.fs.create_file(results_file, contents=results_content)
        data = workers._load_promptfoo_results(results_file)
        self.assertEqual(data, results_data)

    def test_invalid_json(self):
        """Tests with invalid JSON content."""
        results_file = pathlib.Path('/results.json')
        self.fs.create_file(results_file, contents='{invalid json')
        with self.assertLogs(level='ERROR') as cm:
            data = workers._load_promptfoo_results(results_file)
            self.assertIn('Error when parsing promptfoo results', cm.output[0])
        self.assertEqual(data, {})

    def test_unicode_error(self):
        """Tests with invalid unicode content."""
        results_file = pathlib.Path('/results.json')
        with open(results_file, 'wb') as f:
            f.write(b'\x80')
        with self.assertLogs(level='ERROR') as cm:
            data = workers._load_promptfoo_results(results_file)
            self.assertIn('Error when parsing promptfoo results', cm.output[0])
        self.assertEqual(data, {})


class ExtractTokenUsageUnittest(unittest.TestCase):
    """Unit tests for the _extract_token_usage_from_promptfoo_results."""

    def test_success(self):
        """Tests a successful extraction."""
        results_data = {
            'results': {
                'results': [
                    {
                        'response': {
                            'metrics': {
                                'gemini_cli_token_usage': {
                                    'total_tokens': 10,
                                    'prompt_tokens': 5,
                                    'completion_tokens': 5,
                                },
                            },
                        },
                    },
                ],
            },
        }
        token_usage = workers._extract_token_usage_from_promptfoo_results(
            results_data)
        self.assertEqual(token_usage, {
            'total_tokens': 10,
            'prompt_tokens': 5,
            'completion_tokens': 5,
        })

    def test_no_results_key(self):
        """Tests when the top-level 'results' key is missing."""
        with self.assertLogs(level='ERROR') as cm:
            token_usage = workers._extract_token_usage_from_promptfoo_results(
                {})
            self.assertIn('Did not find promptfoo result information',
                          cm.output[0])
        self.assertEqual(token_usage, {})

    def test_no_nested_results_key(self):
        """Tests when the nested 'results' key is missing."""
        with self.assertLogs(level='ERROR') as cm:
            token_usage = workers._extract_token_usage_from_promptfoo_results({
                'results': {},
            })
            self.assertIn('Did not find promptfoo result information',
                          cm.output[0])
        self.assertEqual(token_usage, {})

    def test_empty_results_list(self):
        """Tests when the results list is empty."""
        results_data = {
            'results': {
                'results': [],
            },
        }
        with self.assertLogs(level='ERROR') as cm:
            token_usage = workers._extract_token_usage_from_promptfoo_results(
                results_data)
            self.assertIn('Did not find promptfoo result information',
                          cm.output[0])
        self.assertEqual(token_usage, {})

    def test_multiple_results(self):
        """Tests that only the first result is used when there are many."""
        results_data = {
            'results': {
                'results': [
                    {
                        'response': {
                            'metrics': {
                                'gemini_cli_token_usage': {
                                    'total_tokens': 10,
                                },
                            },
                        },
                    },
                    {
                        'response': {
                            'metrics': {
                                'gemini_cli_token_usage': {
                                    'total_tokens': 20,
                                },
                            },
                        },
                    },
                ],
            },
        }
        with self.assertLogs(level='WARNING') as cm:
            token_usage = workers._extract_token_usage_from_promptfoo_results(
                results_data)
            self.assertIn('Unexpectedly got 2 results', cm.output[0])
        self.assertEqual(token_usage, {'total_tokens': 10})

    def test_no_response_key(self):
        """Tests when the 'response' key is missing."""
        results_data = {
            'results': {
                'results': [
                    {},
                ],
            },
        }
        with self.assertLogs(level='WARNING') as cm:
            token_usage = workers._extract_token_usage_from_promptfoo_results(
                results_data)
            self.assertIn('Did not find gemini-cli token usage', cm.output[0])
        self.assertEqual(token_usage, {})

    def test_no_metrics_key(self):
        """Tests when the 'metrics' key is missing."""
        results_data = {
            'results': {
                'results': [
                    {
                        'response': {},
                    },
                ],
            },
        }
        with self.assertLogs(level='WARNING') as cm:
            token_usage = workers._extract_token_usage_from_promptfoo_results(
                results_data)
            self.assertIn('Did not find gemini-cli token usage', cm.output[0])
        self.assertEqual(token_usage, {})

    def test_no_token_usage_key(self):
        """Tests when the 'gemini_cli_token_usage' key is missing."""
        results_data = {
            'results': {
                'results': [
                    {
                        'response': {
                            'metrics': {},
                        },
                    },
                ],
            },
        }
        with self.assertLogs(level='WARNING') as cm:
            token_usage = workers._extract_token_usage_from_promptfoo_results(
                results_data)
            self.assertIn('Did not find gemini-cli token usage', cm.output[0])
        self.assertEqual(token_usage, {})

    def test_empty_token_usage_dict(self):
        """Tests when the token usage dict is empty."""
        results_data = {
            'results': {
                'results': [
                    {
                        'response': {
                            'metrics': {
                                'gemini_cli_token_usage': {},
                            },
                        },
                    },
                ],
            },
        }
        with self.assertLogs(level='WARNING') as cm:
            token_usage = workers._extract_token_usage_from_promptfoo_results(
                results_data)
            self.assertIn('Did not find gemini-cli token usage', cm.output[0])
        self.assertEqual(token_usage, {})


class ExtractScoreUnittest(unittest.TestCase):
    """Unit tests for the _extract_score_from_promptfoo_results."""

    def test_success(self):
        """Tests a successful extraction."""
        results_data = {
            'results': {
                'results': [
                    {
                        'score': 0.5,
                    },
                ],
            },
        }
        score = workers._extract_score_from_promptfoo_results(results_data)
        self.assertEqual(score, 0.5)

    def test_no_score(self):
        """Tests when the score is missing."""
        results_data = {
            'results': {
                'results': [
                    {},
                ],
            },
        }
        with self.assertLogs(level='WARNING') as cm:
            score = workers._extract_score_from_promptfoo_results(results_data)
            self.assertIn('Did not find reported score', cm.output[0])
        self.assertIsNone(score)

    def test_multiple_results(self):
        """Tests that only the first result is used when there are many."""
        results_data = {
            'results': {
                'results': [
                    {
                        'score': 0.5,
                    },
                    {
                        'score': 1.0,
                    },
                ],
            },
        }
        with self.assertLogs(level='WARNING') as cm:
            score = workers._extract_score_from_promptfoo_results(results_data)
            self.assertIn('Unexpectedly got 2 results', cm.output[0])
        self.assertEqual(score, 0.5)

    def test_empty_results(self):
        """Tests when the results list is empty."""
        results_data = {
            'results': {
                'results': [],
            },
        }
        with self.assertLogs(level='ERROR') as cm:
            score = workers._extract_score_from_promptfoo_results(results_data)
            self.assertIn('Did not find promptfoo result information',
                          cm.output[0])
        self.assertIsNone(score)


class WorkerThreadUnittest(unittest.TestCase):
    """Unit tests for the WorkerThread class."""

    def setUp(self):
        self._setUpMocks()
        self._setUpPatches()

    def _setUpMocks(self):
        """Set up mocks for the tests."""
        self.mock_promptfoo = mock.Mock(
            spec=promptfoo_installation.PromptfooInstallation)
        self.mock_promptfoo.run.return_value = subprocess.CompletedProcess(
            args=[], returncode=0, stdout='Success')
        self.worker_options = workers.WorkerOptions(
            clean=True,
            verbose=False,
            force=False,
            sandbox=False,
            gemini_cli_bin=None,
        )
        self.test_input_queue = queue.Queue()
        self.test_result_queue = queue.Queue()

    def _setUpPatches(self):
        """Set up patches for the tests."""
        workdir_patcher = mock.patch('workers.WorkDir')
        self.mock_workdir = workdir_patcher.start()
        mock_workdir_instance = (
            self.mock_workdir.return_value.__enter__.return_value)
        mock_workdir_instance.path = pathlib.Path('/workdir')
        self.addCleanup(workdir_patcher.stop)

        polling_patcher = mock.patch(
            'workers._AVAILABLE_TEST_POLLING_SLEEP_DURATION',
            _POLLING_INTERVAL)
        polling_patcher.start()
        self.addCleanup(polling_patcher.stop)

        get_gclient_root_patcher = mock.patch(
            'checkout_helpers.get_gclient_root')
        self.mock_get_gclient_root = get_gclient_root_patcher.start()
        self.mock_get_gclient_root.return_value = pathlib.Path('/root')
        self.addCleanup(get_gclient_root_patcher.stop)

    def _create_and_run_worker(self, configs):
        """Helper to create and run a worker thread."""
        for config in configs:
            self.test_input_queue.put(config)

        worker = workers.WorkerThread(
            worker_index=0,
            promptfoo=self.mock_promptfoo,
            worker_options=self.worker_options,
            test_input_queue=self.test_input_queue,
            test_result_queue=self.test_result_queue,
        )
        worker.start()

        while self.test_result_queue.qsize() < len(configs):
            worker.maybe_reraise_fatal_exception()
            time.sleep(_POLLING_INTERVAL)

        worker.shutdown()
        worker.join(1)
        return worker

    def test_run_one_test_pass(self):
        config = eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml'))
        self._create_and_run_worker([config])

        self.mock_workdir.assert_called_once_with('workdir-0',
                                                  pathlib.Path('/root'), True,
                                                  False, False)
        self.mock_promptfoo.run.assert_called_once()
        self.assertEqual(self.test_result_queue.qsize(), 1)
        result = self.test_result_queue.get()
        self.assertEqual(result.config.test_file, config.test_file)
        self.assertTrue(result.success)

    def test_run_one_test_fail(self):
        """Tests running a single failing test."""
        self.mock_promptfoo.run.return_value = subprocess.CompletedProcess(
            args=[], returncode=1, stdout='Failure')
        config = eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml'))
        self._create_and_run_worker([config])

        self.assertEqual(self.test_result_queue.qsize(), 1)
        result = self.test_result_queue.get()
        self.assertEqual(result.config.test_file, config.test_file)
        self.assertFalse(result.success)

    def test_run_multiple_tests(self):
        configs = [
            eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml')),
            eval_config.TestConfig(test_file=pathlib.Path('/test/b.yaml'))
        ]
        self._create_and_run_worker(configs)

        self.assertEqual(self.mock_workdir.call_count, 2)
        self.assertEqual(self.mock_promptfoo.run.call_count, 2)
        self.assertEqual(self.test_result_queue.qsize(), 2)

    def test_shutdown(self):
        """Tests that the worker thread shuts down gracefully."""
        worker = self._create_and_run_worker([])
        self.assertFalse(worker.is_alive())

    def test_fatal_exception(self):
        """Tests that fatal exceptions are propagated."""
        worker = self._create_and_run_worker([])
        with mock.patch.object(worker,
                               '_run_incoming_tests_until_shutdown',
                               side_effect=ValueError('Test Error')):
            worker.run()

        with self.assertRaisesRegex(ValueError, 'Test Error'):
            worker.maybe_reraise_fatal_exception()

    def test_no_fatal_exception(self):
        """Tests that no exception is raised when there is no fatal error."""
        worker = self._create_and_run_worker([])
        # Should be a no-op.
        worker.maybe_reraise_fatal_exception()

    def test_sandbox_and_verbose(self):
        """Tests that sandbox and verbose flags are passed to promptfoo."""
        self.worker_options.sandbox = True
        self.worker_options.verbose = True
        self._create_and_run_worker(
            [eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml'))])

        self.mock_promptfoo.run.assert_called_once()
        command = self.mock_promptfoo.run.call_args[0][0]
        self.assertIn('--var', command)
        self.assertIn('sandbox=True', command)
        self.assertIn('verbose=True', command)
        self.assertIn(f'console_width={shutil.get_terminal_size().columns}',
                      command)

    def test_gemini_cli_bin(self):
        """Tests that gemini_cli_bin is passed to promptfoo."""
        gemini_cli_bin = pathlib.Path('/', 'custom', 'gemini')
        self.worker_options.gemini_cli_bin = gemini_cli_bin
        self._create_and_run_worker(
            [eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml'))])

        self.mock_promptfoo.run.assert_called_once()
        command = self.mock_promptfoo.run.call_args[0][0]
        self.assertIn('--var', command)
        self.assertIn(f'gemini_cli_bin={gemini_cli_bin}', command)


class RunOneConfigTest(WorkerThreadUnittest):
    """Tests for the `_run_one_config` method in `workers.py`."""

    def test_aggregation(self):
        """Tests that all metrics are aggregated correctly."""
        config = eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml'),
                                        runs_per_test=3,
                                        pass_k_threshold=2)
        results_to_return = [
            results.IterationResult(
                success=True,
                duration=1.0,
                test_log='log1',
                metrics={
                    'token_usage': {
                        'total': 10
                    },
                },
                prompt=None,
                response=None,
            ),
            results.IterationResult(
                success=False,
                duration=1.5,
                test_log='log2',
                metrics={
                    'token_usage': {
                        'total': 5
                    },
                },
                prompt=None,
                response=None,
            ),
            results.IterationResult(
                success=True,
                duration=2.0,
                test_log='log3',
                metrics={
                    'token_usage': {
                        'total': 15
                    },
                },
                prompt=None,
                response=None,
            ),
        ]
        with mock.patch.object(workers.WorkerThread,
                               '_run_single_iteration',
                               side_effect=results_to_return):
            self._create_and_run_worker([config])

        self.assertEqual(self.test_result_queue.qsize(), 1)
        result = self.test_result_queue.get()
        self.assertTrue(result.success)
        self.assertEqual(result.successful_runs, 2)
        self.assertEqual(result.total_duration, 4.5)
        self.assertEqual(result.average_duration, 1.5)
        self.assertEqual(
            result.combined_logs, 'Iteration #0:\nlog1\n'
            'Iteration #1:\nlog2\n'
            'Iteration #2:\nlog3')

    def test_success_criteria_pass(self):
        """Tests that a test is marked as successful when it passes."""
        config = eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml'),
                                        runs_per_test=3,
                                        pass_k_threshold=2)
        results_to_return = [
            results.IterationResult(
                success=True,
                duration=1,
                test_log='',
                metrics={},
                prompt=None,
                response=None,
            ),
            results.IterationResult(
                success=False,
                duration=1,
                test_log='',
                metrics={},
                prompt=None,
                response=None,
            ),
            results.IterationResult(
                success=True,
                duration=1,
                test_log='',
                metrics={},
                prompt=None,
                response=None,
            ),
        ]
        with mock.patch.object(workers.WorkerThread,
                               '_run_single_iteration',
                               side_effect=results_to_return):
            self._create_and_run_worker([config])

        self.assertEqual(self.test_result_queue.qsize(), 1)
        result = self.test_result_queue.get()
        self.assertTrue(result.success)
        self.assertEqual(result.successful_runs, 2)

    def test_success_criteria_fail(self):
        """Tests that a test is marked as failed when it does not pass."""
        config = eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml'),
                                        runs_per_test=3,
                                        pass_k_threshold=3)
        results_to_return = [
            results.IterationResult(
                success=True,
                duration=1,
                test_log='',
                metrics={},
                prompt=None,
                response=None,
            ),
            results.IterationResult(
                success=True,
                duration=1,
                test_log='',
                metrics={},
                prompt=None,
                response=None,
            ),
            results.IterationResult(
                success=False,
                duration=1,
                test_log='',
                metrics={},
                prompt=None,
                response=None,
            ),
        ]
        with mock.patch.object(workers.WorkerThread,
                               '_run_single_iteration',
                               side_effect=results_to_return):
            self._create_and_run_worker([config])

        self.assertEqual(self.test_result_queue.qsize(), 1)
        result = self.test_result_queue.get()
        self.assertFalse(result.success)
        self.assertEqual(result.successful_runs, 2)

    def test_early_exit_on_pass(self):
        """Tests that the test exits early when the pass threshold is met."""
        config = eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml'),
                                        runs_per_test=5,
                                        pass_k_threshold=2)
        results_to_return = [
            results.IterationResult(
                success=True,
                duration=1,
                test_log='',
                metrics={},
                prompt=None,
                response=None,
            ),
            results.IterationResult(
                success=True,
                duration=1,
                test_log='',
                metrics={},
                prompt=None,
                response=None,
            ),
        ]
        with mock.patch.object(workers.WorkerThread,
                               '_run_single_iteration',
                               side_effect=results_to_return) as mock_run:
            self._create_and_run_worker([config])
            self.assertEqual(mock_run.call_count, 2)

        self.assertEqual(self.test_result_queue.qsize(), 1)
        result = self.test_result_queue.get()
        self.assertTrue(result.success)

    def test_early_exit_on_fail(self):
        """Tests that the test exits early when it can no longer pass."""
        config = eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml'),
                                        runs_per_test=5,
                                        pass_k_threshold=3)
        results_to_return = [
            results.IterationResult(
                success=False,
                duration=1,
                test_log='',
                metrics={},
                prompt=None,
                response=None,
            ),
            results.IterationResult(
                success=False,
                duration=1,
                test_log='',
                metrics={},
                prompt=None,
                response=None,
            ),
            results.IterationResult(
                success=False,
                duration=1,
                test_log='',
                metrics={},
                prompt=None,
                response=None,
            ),
        ]
        with mock.patch.object(workers.WorkerThread,
                               '_run_single_iteration',
                               side_effect=results_to_return) as mock_run:
            self._create_and_run_worker([config])
            self.assertEqual(mock_run.call_count, 3)

        self.assertEqual(self.test_result_queue.qsize(), 1)
        result = self.test_result_queue.get()
        self.assertFalse(result.success)


class WorkerPoolUnittest(unittest.TestCase):
    """Unit tests for the WorkerPool class."""

    def setUp(self):
        self._setUpMocks()
        self._setUpPatches()

    def _setUpMocks(self):
        """Set up mocks for the tests."""
        self.mock_promptfoo = mock.Mock(
            spec=promptfoo_installation.PromptfooInstallation)
        self.worker_options = workers.WorkerOptions(
            clean=True,
            verbose=False,
            force=False,
            sandbox=False,
        )
        self.result_options = results.ResultOptions(
            print_output_on_success=False,
            result_handlers=[],
        )

    def _setUpPatches(self):
        """Set up patches for the tests."""

        def create_thread_join_side_effect(mock_thread):

            def thread_join_side_effect(*args, **kwargs):
                # pylint: disable=unused-argument
                mock_thread.is_alive.return_value = False

            return thread_join_side_effect

        atomic_counter_patcher = mock.patch('workers.results.AtomicCounter')
        self.mock_atomic_counter = atomic_counter_patcher.start()
        self.addCleanup(atomic_counter_patcher.stop)

        result_thread_patcher = mock.patch('workers.results.ResultThread')
        self.mock_result_thread = result_thread_patcher.start()
        mock_result_thread_instance = self.mock_result_thread.return_value
        mock_result_thread_instance.is_alive.return_value = True
        mock_result_thread_instance.join.side_effect = (
            create_thread_join_side_effect(mock_result_thread_instance))
        mock_result_thread_instance.total_results_reported = (
            self.mock_atomic_counter.return_value)
        mock_result_thread_instance.failed_result_output_queue = mock.Mock(
            spec=queue.Queue)
        self.addCleanup(result_thread_patcher.stop)

        worker_thread_patcher = mock.patch('workers.WorkerThread')
        self.mock_worker_thread = worker_thread_patcher.start()
        mock_worker_thread_instance = self.mock_worker_thread.return_value
        mock_worker_thread_instance.is_alive.return_value = True
        mock_worker_thread_instance.join.side_effect = (
            create_thread_join_side_effect(mock_worker_thread_instance))
        self.addCleanup(worker_thread_patcher.stop)

        polling_patcher = mock.patch(
            'workers._ALL_QUEUED_TESTS_RUN_POLLING_SLEEP_DURATION',
            _POLLING_INTERVAL)
        polling_patcher.start()
        self.addCleanup(polling_patcher.stop)

    def test_create_pool(self):
        """Tests that the pool is created with the correct number of workers."""
        pool = workers.WorkerPool(
            num_workers=3,
            promptfoo=self.mock_promptfoo,
            worker_options=self.worker_options,
            result_options=self.result_options,
        )
        self.assertEqual(self.mock_worker_thread.call_count, 3)
        self.mock_result_thread.assert_called_once()
        pool.shutdown_blocking(1)

    def test_queue_tests(self):
        """Tests that tests are queued correctly."""
        pool = workers.WorkerPool(
            num_workers=1,
            promptfoo=self.mock_promptfoo,
            worker_options=self.worker_options,
            result_options=self.result_options,
        )
        configs = [
            eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml')),
            eval_config.TestConfig(test_file=pathlib.Path('/test/b.yaml'))
        ]
        pool.queue_tests(configs)
        self.assertEqual(pool._test_input_queue.qsize(), 2)
        pool.shutdown_blocking(1)

    def test_wait_for_all_queued_tests(self):
        """Tests that the pool waits for all tests to complete."""
        self.mock_atomic_counter.return_value.get.side_effect = [0, 1, 2]
        pool = workers.WorkerPool(
            num_workers=1,
            promptfoo=self.mock_promptfoo,
            worker_options=self.worker_options,
            result_options=self.result_options,
        )
        configs = [
            eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml')),
            eval_config.TestConfig(test_file=pathlib.Path('/test/b.yaml'))
        ]
        pool.queue_tests(configs)
        failed_tests = pool.wait_for_all_queued_tests()
        self.assertEqual(len(failed_tests), 0)
        self.assertEqual(self.mock_atomic_counter.return_value.get.call_count,
                         3)
        pool.shutdown_blocking(1)

    def test_wait_for_all_queued_tests_with_failures(self):
        """Tests that failed tests are returned."""
        self.mock_atomic_counter.return_value.get.return_value = 1
        config = eval_config.TestConfig(test_file='fail.yaml')
        failed_test = results.TestResult(config=config,
                                         success=False,
                                         iteration_results=[
                                             results.IterationResult(
                                                 success=False,
                                                 duration=1,
                                                 test_log='',
                                                 metrics={},
                                                 prompt=None,
                                                 response=None,
                                             )
                                         ])
        mock_failed_queue = (
            self.mock_result_thread.return_value.failed_result_output_queue)
        mock_failed_queue.empty.side_effect = [False, True]
        mock_failed_queue.get.return_value = failed_test

        pool = workers.WorkerPool(
            num_workers=1,
            promptfoo=self.mock_promptfoo,
            worker_options=self.worker_options,
            result_options=self.result_options,
        )
        pool.queue_tests(
            [eval_config.TestConfig(test_file=pathlib.Path('fail.yaml'))])
        failed_tests = pool.wait_for_all_queued_tests()
        self.assertEqual(len(failed_tests), 1)
        self.assertEqual(failed_tests[0], failed_test)
        pool.shutdown_blocking(1)



    def test_shutdown_blocking(self):
        """Tests that shutdown_blocking shuts down all threads."""
        pool = workers.WorkerPool(
            num_workers=2,
            promptfoo=self.mock_promptfoo,
            worker_options=self.worker_options,
            result_options=self.result_options,
        )
        mock_workers = self.mock_worker_thread.return_value
        mock_result = self.mock_result_thread.return_value

        pool.shutdown_blocking(1)

        self.assertEqual(mock_workers.shutdown.call_count, 2)
        mock_result.shutdown.assert_called_once()
        self.assertEqual(mock_workers.join.call_count, 2)
        mock_result.join.assert_called_once()

    def test_shutdown_blocking_timeout(self):
        """Tests that shutdown_blocking logs an error on timeout."""
        pool = workers.WorkerPool(
            num_workers=1,
            promptfoo=self.mock_promptfoo,
            worker_options=self.worker_options,
            result_options=self.result_options,
        )
        self.mock_worker_thread.return_value.join.side_effect = None
        self.mock_worker_thread.return_value.is_alive.return_value = True
        with self.assertLogs(level='ERROR') as cm:
            pool.shutdown_blocking(0.01)
            self.assertIn('Failed to gracefully shut down thread',
                          cm.output[0])

    def test_wait_for_all_queued_tests_with_multiple_workers(self):
        """Tests that the pool waits for all tests with multiple workers."""
        self.mock_atomic_counter.return_value.get.side_effect = [0, 0, 1, 1, 2]
        pool = workers.WorkerPool(
            num_workers=2,
            promptfoo=self.mock_promptfoo,
            worker_options=self.worker_options,
            result_options=self.result_options,
        )
        test_paths = [
            pathlib.Path('/test/a.yaml'),
            pathlib.Path('/test/b.yaml')
        ]
        configs = [eval_config.TestConfig(test_file=p) for p in test_paths]
        pool.queue_tests(configs)
        failed_tests = pool.wait_for_all_queued_tests()
        self.assertEqual(len(failed_tests), 0)
        self.assertEqual(self.mock_atomic_counter.return_value.get.call_count,
                         5)
        pool.shutdown_blocking(1)

    def test_worker_thread_fatal_exception(self):
        """Tests that a fatal exception in a worker thread is propagated."""
        self.mock_worker_thread.return_value.maybe_reraise_fatal_exception. \
            side_effect = ValueError('Worker Error')
        pool = workers.WorkerPool(
            num_workers=1,
            promptfoo=self.mock_promptfoo,
            worker_options=self.worker_options,
            result_options=self.result_options,
        )
        pool.queue_tests(
            [eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml'))])
        with self.assertRaisesRegex(ValueError, 'Worker Error'):
            pool.wait_for_all_queued_tests()
        pool.shutdown_blocking(1)

    def test_result_thread_fatal_exception(self):
        """Tests that a fatal exception in the result thread is propagated."""
        self.mock_result_thread.return_value.maybe_reraise_fatal_exception. \
            side_effect = ValueError('Result Error')
        pool = workers.WorkerPool(
            num_workers=1,
            promptfoo=self.mock_promptfoo,
            worker_options=self.worker_options,
            result_options=self.result_options,
        )
        pool.queue_tests(
            [eval_config.TestConfig(test_file=pathlib.Path('/test/a.yaml'))])
        with self.assertRaisesRegex(ValueError, 'Result Error'):
            pool.wait_for_all_queued_tests()
        pool.shutdown_blocking(1)

    def test_del(self):
        """Tests that the destructor calls shutdown_blocking."""
        pool = workers.WorkerPool(
            num_workers=1,
            promptfoo=self.mock_promptfoo,
            worker_options=self.worker_options,
            result_options=self.result_options,
        )
        shutdown_mock = mock.Mock()
        pool.shutdown_blocking = shutdown_mock
        del pool
        shutdown_mock.assert_called_once_with(2)


if __name__ == '__main__':
    unittest.main()