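# Source: AtomGit code preview of msmodeling/tests/smoke/test_throughput_optimizer_smoke.py
# Project: MindStudio-Modeling - a PyTorch-based framework for simulating and analyzing neural-network inference performance.
# Latest commit (ascend-robot): feat(serving_cast): support chunked prefill modeling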
"""Smoke guard for throughput_optimizer CLI nightly regressions.

Nightly coverage mapping
------------------------
test_prefix_cache_hit_rate_aggregation_valid      - already present before this change
test_prefix_cache_hit_rate_disaggregation_*_valid - already present before this change
test_vl_model_image_args                          -> TestThroughputOptimizerNightly.test_vl_model_aggregation_with_output_validation
test_vl_disagg_prefill_smoke                      -> TestThroughputOptimizerNightly.test_vl_model_disaggregation_prefill_with_output_validation
test_vl_disagg_decode_smoke                       -> TestThroughputOptimizerNightly.test_vl_model_disaggregation_decode_with_output_validation
test_vl_moe_aggregation_compile_smoke             -> TestThroughputOptimizerNightly.test_VL_MOE_model_aggregation_with_output_validation
test_prefix_cache_with_max_batched_tokens_allows_chunked_prefill -> TestThroughputOptimizerNightly
                                                                    (test_prefix_cache_hit_rate_allows_chunked_prefill_when_effective_input_exceeds_max_batched_tokens)
test_deepseek_pd_ratio_mode                       -> TestThroughputOptimizerNightly
                                                     (test_deepseek_model_pd_ratio_with_output_validation)
"""

from unittest import TestCase

from tests.helpers.cli_runner import run_module_main

THROUGHPUT_OPTIMIZER_MODULE = "cli.inference.throughput_optimizer"


class TestThroughputOptimizerSmoke(TestCase):
    """Smoke cases that run the throughput_optimizer CLI and expect exit code 0."""

    def _run_throughput_optimizer(self, args, check=True):
        """Run the throughput_optimizer module with ``args`` and return the result.

        Raises:
            RuntimeError: when ``check`` is truthy and the return code is
                non-zero; the message carries the return code and stderr.
        """
        completed = run_module_main(THROUGHPUT_OPTIMIZER_MODULE, args)
        if not check or completed.returncode == 0:
            return completed
        raise RuntimeError(f"throughput_optimizer failed (rc={completed.returncode}): {completed.stderr}")

    def _assert_cli_succeeds(self, cli_args):
        """Run the CLI unchecked and assert a zero return code, showing stderr on failure."""
        outcome = self._run_throughput_optimizer(cli_args, check=False)
        self.assertEqual(outcome.returncode, 0, msg=outcome.stderr)

    def test_prefix_cache_hit_rate_aggregation_valid(self):
        """Run without --disagg, with --tpot-limits and --prefix-cache-hit-rate=0.5."""
        self._assert_cli_succeeds(
            [
                "--input-length=64",
                "--output-length=16",
                "Qwen/Qwen3-32B",
                "--device=TEST_DEVICE",
                "--num-devices=1",
                "--jobs=1",
                "--tpot-limits=1000",
                "--batch-range",
                "1",
                "2",
                "--prefix-cache-hit-rate=0.5",
            ]
        )

    def test_prefix_cache_hit_rate_disaggregation_prefill_valid(self):
        """Run with --disagg and --ttft-limits plus --prefix-cache-hit-rate=0.5."""
        self._assert_cli_succeeds(
            [
                "--input-length=64",
                "--output-length=16",
                "Qwen/Qwen3-32B",
                "--device=TEST_DEVICE",
                "--num-devices=1",
                "--jobs=1",
                "--ttft-limits=1000",
                "--batch-range",
                "1",
                "2",
                "--prefix-cache-hit-rate=0.5",
                "--disagg",
            ]
        )

    def test_prefix_cache_hit_rate_disaggregation_decode_valid(self):
        """Run with --disagg and --tpot-limits plus --prefix-cache-hit-rate=0.5."""
        self._assert_cli_succeeds(
            [
                "--input-length=64",
                "--output-length=16",
                "Qwen/Qwen3-32B",
                "--device=TEST_DEVICE",
                "--num-devices=1",
                "--jobs=1",
                "--tpot-limits=1000",
                "--batch-range",
                "1",
                "2",
                "--prefix-cache-hit-rate=0.5",
                "--disagg",
            ]
        )

    def test_vl_model_image_args(self):
        """Aggregation run of a VL model with image size flags.

        Guards test_vl_model_aggregation_with_output_validation.
        """
        self._assert_cli_succeeds(
            [
                "--input-length=64",
                "--output-length=16",
                "Qwen/Qwen3-VL-30B-A3B-Instruct",
                "--device=TEST_DEVICE",
                "--num-devices=4",
                "--jobs=1",
                "--tpot-limits=10000",
                "--batch-range",
                "1",
                "2",
                "--image-height=224",
                "--image-width=224",
            ]
        )

    def test_vl_disagg_prefill_smoke(self):
        """Disaggregated VL run limited by TTFT.

        Guards test_vl_model_disaggregation_prefill_with_output_validation.
        """
        self._assert_cli_succeeds(
            [
                "--input-length=64",
                "--output-length=16",
                "Qwen/Qwen3-VL-30B-A3B-Instruct",
                "--device=TEST_DEVICE",
                "--num-devices=4",
                "--jobs=1",
                "--ttft-limits=10000",
                "--batch-range",
                "1",
                "2",
                "--image-height=224",
                "--image-width=224",
                "--disagg",
            ]
        )

    def test_vl_disagg_decode_smoke(self):
        """Disaggregated VL run limited by TPOT.

        Guards test_vl_model_disaggregation_decode_with_output_validation.
        """
        self._assert_cli_succeeds(
            [
                "--input-length=64",
                "--output-length=16",
                "zai-org/GLM-4.5V",
                "--device=TEST_DEVICE",
                "--num-devices=4",
                "--jobs=1",
                "--tpot-limits=10000",
                "--image-height=224",
                "--image-width=224",
                "--disagg",
            ]
        )

    def test_vl_moe_aggregation_compile_smoke(self):
        """Aggregation run of a VL MOE model with --compile.

        Guards test_VL_MOE_model_aggregation_with_output_validation.
        Qwen3-VL-30B stands in for the nightly 235B model so the PR smoke run
        stays within its time budget.
        """
        self._assert_cli_succeeds(
            [
                "--input-length=16",
                "--output-length=8",
                "Qwen/Qwen3-VL-30B-A3B-Instruct",
                "--device=TEST_DEVICE",
                "--num-devices=2",
                "--jobs=1",
                "--tpot-limits=10000",
                "--image-height=224",
                "--image-width=224",
                "--compile",
                "--quantize-linear-action=W8A8_DYNAMIC",
                "--batch-range",
                "1",
                "1",
                "--max-batched-tokens=16",
            ]
        )

    def test_prefix_cache_with_max_batched_tokens_allows_chunked_prefill(self):
        """Prefix-cache hit rate combined with max-batched-tokens may use chunked prefill.

        Guards test_prefix_cache_hit_rate_allows_chunked_prefill_when_effective_input_exceeds_max_batched_tokens.
        input_length=200 with prefix_cache_hit_rate=0.5 gives
        effective_input_length=100; max_batched_tokens=99 is below that, so the
        CLI should model two prefill chunks.
        """
        self._assert_cli_succeeds(
            [
                "--input-length=200",
                "--output-length=16",
                "Qwen/Qwen3-32B",
                "--device=TEST_DEVICE",
                "--num-devices=1",
                "--jobs=1",
                "--tpot-limits=1000",
                "--batch-range",
                "1",
                "2",
                "--prefix-cache-hit-rate=0.5",
                "--max-batched-tokens=99",
            ]
        )

    def test_deepseek_pd_ratio_mode(self):
        """Prefill/decode ratio optimization mode; guards the DeepSeek PD-ratio nightly regression."""
        self._assert_cli_succeeds(
            [
                "--input-length=64",
                "--output-length=16",
                "deepseek-ai/DeepSeek-V3.1",
                "--enable-optimize-prefill-decode-ratio",
                "--prefill-devices-per-instance=4",
                "--decode-devices-per-instance=4",
                "--device=TEST_DEVICE",
                "--jobs=1",
                "--ttft-limits=10000",
                "--tpot-limits=10000",
            ]
        )