e2e-auto-test/e2e/installation/installation_test.go-代码预览-e2e-auto-test:基于 pytest 与 Playwright 的自动化测试项目 - AtomGit

Lluolong817fix: fix delete check and skip
package installation

import (
	"fmt"
	"path/filepath"
	"strings"
	"time"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"

	"gitcode.com/openFuyao/e2e-auto-test/e2e/framework/executor"
	config "gitcode.com/openFuyao/e2e-auto-test/e2e/installation/bke-config"
	"gitcode.com/openFuyao/e2e-auto-test/e2e/installation/utils"

	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/client-go/util/homedir"
)

// InstallationItTimeout is the per-spec timeout that the It blocks in this file
// pass to SpecTimeout.
const InstallationItTimeout = 45 * time.Minute

var _ = SIGDescribe("BKE Cluster Installation", func() {
	// InstallationItTimeout (declared at package level) is the default Spec timeout for tests under e2e/installation.

	var (
		sshExecutor     *executor.SSHExecutor
		localExecutor   *executor.LocalExecutor
		guideConfig     *config.GuideNodeConfig
		dynamicClient   dynamic.Interface
		clientSet       *kubernetes.Clientset
		clusterManager  *utils.ClusterManager
		resourceManager *utils.ResourceManager
	)

	BeforeEach(func() {
		// Read the guide-node settings from the environment; host and password are mandatory.
		guideConfig = config.LoadGuideNodeFromEnv()
		Expect(guideConfig.Host).NotTo(BeEmpty(), "GUIDE_NODE_HOST 环境变量必须设置")
		Expect(guideConfig.Password).NotTo(BeEmpty(), "GUIDE_NODE_PASSWORD 环境变量必须设置")

		// Open the SSH executor towards the guide node.
		var err error
		host, port := guideConfig.Host, guideConfig.Port
		user, password := guideConfig.Username, guideConfig.Password
		sshExecutor, err = executor.NewSSHExecutor(host, port, user, password)
		Expect(err).NotTo(HaveOccurred(), "应该成功连接到引导节点")

		// Build the REST config from ~/.kube/config (used to reach the bootstrap cluster).
		kubeconfigPath := filepath.Join(homedir.HomeDir(), ".kube", "config")
		restCfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
		Expect(err).NotTo(HaveOccurred(), "应该成功加载 kubeconfig")

		// Dynamic client, shared by the cluster manager and the resource manager.
		dynamicClient, err = dynamic.NewForConfig(restCfg)
		Expect(err).NotTo(HaveOccurred(), "应该成功创建动态客户端")

		// Cluster manager: drives cluster operations over SSH plus the dynamic client.
		clusterManager = utils.NewClusterManager(sshExecutor, dynamicClient)

		// Resource manager: additionally needs the typed clientset.
		clientSet, err = kubernetes.NewForConfig(restCfg)
		Expect(err).NotTo(HaveOccurred())
		resourceManager = utils.NewResourceManager(sshExecutor, dynamicClient, clientSet)

		// Local executor (constructed with a 30s duration), used by the specs for
		// commands issued from the test host.
		localExecutor = executor.NewLocalExecutor(30 * time.Second)
	})

	Describe("创建 1Master 集群并断 1Master节点的网络", Label("1master", "network-disconnect", "post-init", "skip-temporarily"), func() {
		var (
			clusterName    string
			clusterConfig  *config.BKEClusterConfig
			configPath     string
			nodeConfigPath string
			masterNode     config.NodeInfo
			backupFile     string
		)

		BeforeEach(func() {
			clusterName = fmt.Sprintf("test-1m-disconnect-%d", time.Now().Unix())

			// 加载节点配置
			nodes := config.LoadTestNodesFromEnv()
			Expect(len(nodes)).To(BeNumerically(">=", 1), "1Master 集群需要至少 1 个节点")

			// 使用第一个节点作为 Master
			masterNode = nodes[0]
			masterNode.Role = []string{"master/node", "etcd"}

			// 创建集群配置
			clusterConfig = config.NewDefaultBKEClusterConfig(clusterName, []config.NodeInfo{masterNode})

			// 使用辅助函数断开 master 节点的网络
			var err error
			backupFile, err = DisconnectNetwork(masterNode, localExecutor)
			Expect(err).NotTo(HaveOccurred(), "应该成功断开 master 节点的网络")
		})

		AfterEach(func() {
			// Config-file cleanup is deferred so that it still runs when the deletion
			// assertion below fails: a failed assertion unwinds this function, which
			// previously skipped the cleanup and left the files on the guide node.
			defer func() {
				if configPath != "" {
					if err := clusterManager.CleanupConfig(configPath); err != nil {
						GinkgoWriter.Printf("清理配置文件失败: %v\n", err)
					}
				}
				if nodeConfigPath != "" {
					if err := clusterManager.CleanupConfig(nodeConfigPath); err != nil {
						GinkgoWriter.Printf("Failed to cleanup node config file: %v\n", err)
					}
				}
			}()

			// Restore the master node's network first; a failure is logged only so
			// that the remaining cleanup still happens.
			if err := RestoreNetwork(masterNode, localExecutor, backupFile); err != nil {
				GinkgoWriter.Printf("恢复 master 节点网络失败: %v\n", err)
			}

			By("清理测试集群")
			if !clusterManager.ClusterExistsWithKubeconfig(clusterName, "") {
				return
			}

			By("触发集群删除")
			if err := clusterManager.DeleteClusterWithKubeconfig(clusterName, ""); err != nil {
				GinkgoWriter.Printf("触发集群删除失败: %v\n", err)
			}

			// Poll until the cluster resource is gone.
			By("等待集群完全删除")
			Eventually(func() bool {
				if !clusterManager.ClusterExistsWithKubeconfig(clusterName, "") {
					return true
				}
				// The phase is printed for diagnostics only, so its error is ignored.
				phase, _ := clusterManager.GetClusterPhaseWithKubeconfig(clusterName, "")
				GinkgoWriter.Printf("当前集群 phase: %s\n", phase)
				return false
			}, uninstallTimeout, 60*time.Second).Should(BeTrue(), "集群应该被完全删除")
		})

		// Case: cluster creation must be blocked while the master node's network is disconnected.
		// Steps: 1) generate and upload the cluster config via the guide node; 2) run `bke cluster create`
		// while the master's network is down; 3) poll the cluster status until the creation failure is
		// confirmed (DeployFailed/InitializationFailed).
		// Expected: the final cluster state is not Healthy and the creation is correctly blocked.
		It("当 master 节点网络断开时应该阻止集群创建", SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			By("生成集群配置文件")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")
			GinkgoWriter.Printf("配置文件路径: %s\n", configPath)

			By("验证配置文件已上传")
			err = clusterManager.GetConfigGenerator().ValidateConfigOnNode(configPath)
			Expect(err).NotTo(HaveOccurred(), "配置文件应该存在于引导节点")

			By("执行 bke cluster create 命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待并验证集群创建被阻止")
			// Poll every 30s for up to 10 minutes until the cluster reports a failure.
			checkInterval := 30 * time.Second
			checkTimeout := 10 * time.Minute
			Eventually(func() bool {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					// The returned values are not meaningful on error; keep polling.
					GinkgoWriter.Printf("获取集群完整状态失败: %v\n", err)
					return false
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)

				if state == "Healthy" {
					// A Healthy cluster means creation was not blocked; abort right away.
					Fail(fmt.Sprintf("集群意外变为 Healthy。预期集群创建被阻止，但集群状态为: %s", state))
				}

				// Either signal is accepted: per the original author's note, state may
				// never reach DeployFailed even when clusterStatus stays at
				// InitializationFailed for a long time.
				return state == "DeployFailed" || clusterStatus == "InitializationFailed"
			}, checkTimeout, checkInterval).WithContext(ctx).Should(BeTrue(), "集群创建应该被阻止，状态为 DeployFailed 和 InitializationFailed")

			// Final state check.
			finalState, err := clusterManager.GetClusterStatusWithKubeconfig(clusterName, "")
			Expect(err).NotTo(HaveOccurred())
			Expect(finalState).NotTo(Equal("Healthy"), "当 master 节点网络断开时，集群不应该是 Healthy")
			GinkgoWriter.Printf("最终集群状态: %s (预期被阻止，不是 Healthy)\n", finalState)
		})
	})

	Describe("创建 1Master1Worker 业务集群并断开 Worker 节点网络", Label("1master1worker", "network-disconnect", "post-init", "skip-temporarily"), func() {
		var (
			clusterName    string
			clusterConfig  *config.BKEClusterConfig
			configPath     string
			nodeConfigPath string
			workerNode     config.NodeInfo
			backupFile     string
		)

		BeforeEach(func() {
			clusterName = fmt.Sprintf("test-1m1w-disconnect-worker-%d", time.Now().Unix())

			// 加载节点配置
			nodes := config.LoadTestNodesFromEnv()
			Expect(len(nodes)).To(BeNumerically(">=", 2), "1Master1Worker 集群需要至少 2 个节点")

			// 使用第一个节点作为 Master
			masterNode := nodes[0]
			masterNode.Role = []string{"master/node", "etcd"}

			// 使用第二个节点作为 Worker
			workerNode = nodes[1]
			workerNode.Role = []string{"node"}

			// 创建集群配置（业务集群不带 cluster-api，默认排除 cluster-api）
			clusterConfig = config.NewDefaultBKEClusterConfig(clusterName, []config.NodeInfo{masterNode, workerNode})

			// 使用辅助函数断开 worker 节点的网络
			var err error
			backupFile, err = DisconnectNetwork(workerNode, localExecutor)
			Expect(err).NotTo(HaveOccurred(), "应该成功断开 worker 节点的网络")
		})

		AfterEach(func() {
			// Config-file cleanup is deferred so that it still runs when the deletion
			// assertion below fails: a failed assertion unwinds this function, which
			// previously skipped the cleanup and left the files on the guide node.
			defer func() {
				if configPath != "" {
					if err := clusterManager.CleanupConfig(configPath); err != nil {
						GinkgoWriter.Printf("清理配置文件失败: %v\n", err)
					}
				}
				if nodeConfigPath != "" {
					if err := clusterManager.CleanupConfig(nodeConfigPath); err != nil {
						GinkgoWriter.Printf("Failed to cleanup node config file: %v\n", err)
					}
				}
			}()

			// Restore the worker node's network first; a failure is logged only so
			// that the remaining cleanup still happens.
			if err := RestoreNetwork(workerNode, localExecutor, backupFile); err != nil {
				GinkgoWriter.Printf("恢复 worker 节点网络失败: %v\n", err)
			}

			By("清理测试集群")
			if !clusterManager.ClusterExistsWithKubeconfig(clusterName, "") {
				return
			}

			By("触发集群删除")
			if err := clusterManager.DeleteClusterWithKubeconfig(clusterName, ""); err != nil {
				GinkgoWriter.Printf("触发集群删除失败: %v\n", err)
			}

			// Poll until the cluster resource is gone.
			By("等待集群完全删除")
			Eventually(func() bool {
				if !clusterManager.ClusterExistsWithKubeconfig(clusterName, "") {
					return true
				}
				// The phase is printed for diagnostics only, so its error is ignored.
				phase, _ := clusterManager.GetClusterPhaseWithKubeconfig(clusterName, "")
				GinkgoWriter.Printf("当前集群 phase: %s\n", phase)
				return false
			}, uninstallTimeout, 60*time.Second).Should(BeTrue(), "集群应该被完全删除")
		})

		// Case: with the worker node's network disconnected, the cluster should still be created and stay Healthy.
		// Steps: 1) generate and upload the cluster config via the guide node and verify it exists there;
		// 2) run `bke cluster create`; 3) wait for the cluster to become Healthy and Ready.
		// Expected: the cluster finishes initialization with `state=Healthy` and `clusterStatus=Ready`
		// even though the worker's network is down.
		It("当 worker 节点网络断开时应该成功创建集群并保持 Healthy", SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			By("生成集群配置文件")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")
			GinkgoWriter.Printf("配置文件路径: %s\n", configPath)

			By("验证配置文件已上传")
			err = clusterManager.GetConfigGenerator().ValidateConfigOnNode(configPath)
			Expect(err).NotTo(HaveOccurred(), "配置文件应该存在于引导节点")

			By("执行 bke cluster create 命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy 和 Ready")
			Eventually(func() bool {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					// The returned values are not meaningful on error; keep polling.
					GinkgoWriter.Printf("获取集群完整状态失败: %v\n", err)
					return false
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				if state == "DeployFailed" {
					// DeployFailed aborts the spec instead of waiting for the timeout.
					Fail(fmt.Sprintf("集群创建失败，状态为 DeployFailed。Phase: %s, clusterStatus: %s", phase, clusterStatus))
				}
				return state == "Healthy" && clusterStatus == "Ready"
			}, installTimeout, pollInterval).WithContext(ctx).Should(BeTrue(), "即使 worker 节点网络断开，集群也应该变为 Healthy 和 Ready 状态")

			By("验证集群最终状态")
			phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
			Expect(err).NotTo(HaveOccurred())
			GinkgoWriter.Printf("最终集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
			Expect(state).To(Equal("Healthy"), "当 worker 节点网络断开时，集群状态应该是 Healthy")
			Expect(clusterStatus).To(Equal("Ready"), "集群状态应该是 Ready")
		})
	})

	Describe("创建 1Master1Worker 集群并断开 Worker 节点网络后缩容", Label("1m1w-delete-disconnect-1w", "network-disconnect", "post-init", "skip-temporarily"), func() {
		var (
			clusterName    string
			clusterConfig  *config.BKEClusterConfig
			configPath     string
			nodeConfigPath string
			workerNode     config.NodeInfo
			backupFile     string
		)

		BeforeEach(func() {
			clusterName = fmt.Sprintf("test-1m1w-disconnect-worker-scalein-%d", time.Now().Unix())

			// 加载节点配置
			nodes := config.LoadTestNodesFromEnv()
			Expect(len(nodes)).To(BeNumerically(">=", 2), "1Master1Worker 集群需要至少 2 个节点")

			// 使用第一个节点作为 Master
			masterNode := nodes[0]
			masterNode.Role = []string{"master/node", "etcd"}

			// 使用第二个节点作为 Worker
			workerNode = nodes[1]
			workerNode.Role = []string{"node"}

			// 创建集群配置（业务集群不带 cluster-api，默认排除 cluster-api）
			clusterConfig = config.NewDefaultBKEClusterConfig(clusterName, []config.NodeInfo{masterNode, workerNode})

			// 使用辅助函数断开 worker 节点的网络
			var err error
			backupFile, err = DisconnectNetwork(workerNode, localExecutor)
			Expect(err).NotTo(HaveOccurred(), "应该成功断开 worker 节点的网络")
		})

		AfterEach(func() {
			// Config-file cleanup is deferred so that it still runs when the deletion
			// assertion below fails: a failed assertion unwinds this function, which
			// previously skipped the cleanup and left the files on the guide node.
			defer func() {
				if configPath != "" {
					if err := clusterManager.CleanupConfig(configPath); err != nil {
						GinkgoWriter.Printf("清理配置文件失败: %v\n", err)
					}
				}
				if nodeConfigPath != "" {
					if err := clusterManager.CleanupConfig(nodeConfigPath); err != nil {
						GinkgoWriter.Printf("Failed to cleanup node config file: %v\n", err)
					}
				}
			}()

			// Restore the worker node's network first; a failure is logged only so
			// that the remaining cleanup still happens.
			if err := RestoreNetwork(workerNode, localExecutor, backupFile); err != nil {
				GinkgoWriter.Printf("恢复 worker 节点网络失败: %v\n", err)
			}

			By("清理测试集群")
			if !clusterManager.ClusterExistsWithKubeconfig(clusterName, "") {
				return
			}

			By("触发集群删除")
			if err := clusterManager.DeleteClusterWithKubeconfig(clusterName, ""); err != nil {
				GinkgoWriter.Printf("触发集群删除失败: %v\n", err)
			}

			// Poll until the cluster resource is gone.
			By("等待集群完全删除")
			Eventually(func() bool {
				if !clusterManager.ClusterExistsWithKubeconfig(clusterName, "") {
					return true
				}
				// The phase is printed for diagnostics only, so its error is ignored.
				phase, _ := clusterManager.GetClusterPhaseWithKubeconfig(clusterName, "")
				GinkgoWriter.Printf("当前集群 phase: %s\n", phase)
				return false
			}, uninstallTimeout, 60*time.Second).Should(BeTrue(), "集群应该被完全删除")
		})

		// Case: with the worker node's network disconnected, the cluster should be created and stay Healthy,
		// and the worker can then be scaled in.
		// Steps: 1) create the cluster via the guide node and wait for Healthy/Ready; 2) re-confirm
		// Healthy/Ready right before scaling in; 3) scale in the worker, wait for Healthy/Ready again and
		// check that the node count dropped.
		// Expected: after the scale-in the cluster is still usable, `state` is Healthy and `clusterStatus`
		// is Ready, and the node count matches the expectation.
		It("当 worker 节点网络断开时应该成功创建集群并保持 Healthy，然后缩容 worker 节点", SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			By("生成集群配置文件")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")
			GinkgoWriter.Printf("配置文件路径: %s\n", configPath)

			By("验证配置文件已上传")
			err = clusterManager.GetConfigGenerator().ValidateConfigOnNode(configPath)
			Expect(err).NotTo(HaveOccurred(), "配置文件应该存在于引导节点")

			By("执行 bke cluster create 命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy 和 Ready")
			Eventually(func() bool {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					// Same handling as the post-scale-in poll below: the returned
					// values are not meaningful on error, so keep polling.
					GinkgoWriter.Printf("获取集群完整状态失败: %v\n", err)
					return false
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				if state == "DeployFailed" {
					// DeployFailed aborts the spec instead of waiting for the timeout.
					Fail(fmt.Sprintf("集群创建失败，状态为 DeployFailed。Phase: %s, clusterStatus: %s", phase, clusterStatus))
				}
				return state == "Healthy" && clusterStatus == "Ready"
			}, installTimeout, pollInterval).WithContext(ctx).Should(BeTrue(), "即使 worker 节点网络断开，集群也应该变为 Healthy 和 Ready 状态")

			By("验证集群在缩容前是 Healthy 和 Ready")
			phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
			Expect(err).NotTo(HaveOccurred())
			Expect(state).To(Equal("Healthy"), "缩容前集群状态应该是 Healthy")
			Expect(clusterStatus).To(Equal("Ready"), "缩容前集群状态应该是 Ready")
			GinkgoWriter.Printf("缩容前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)

			// Scale in the worker through the cluster manager's ScaleInNode.
			By(fmt.Sprintf("缩容 worker 节点: %s", workerNode.IP))
			err = clusterManager.ScaleInNode(clusterName, workerNode.IP)
			Expect(err).NotTo(HaveOccurred(), "应该成功缩容 worker 节点")

			// Wait for the cluster to settle back to Ready and Healthy after the scale-in.
			By("等待缩容后集群变为 Ready 和 Healthy")
			Eventually(func() bool {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					GinkgoWriter.Printf("获取集群完整状态失败: %v\n", err)
					return false
				}
				GinkgoWriter.Printf("缩容后当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state == "Healthy" && clusterStatus == "Ready"
			}, installTimeout, pollInterval).WithContext(ctx).Should(BeTrue(), "缩容后集群应该变为 Healthy 和 Ready")

			// Verify the final state.
			By("验证缩容后集群最终状态")
			phase, state, clusterStatus, err = clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
			Expect(err).NotTo(HaveOccurred())
			GinkgoWriter.Printf("缩容后最终集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
			Expect(state).To(Equal("Healthy"), "缩容后集群状态应该是 Healthy")
			Expect(clusterStatus).To(Equal("Ready"), "缩容后集群状态应该是 Ready")

			// Only the master should be left, so the node count must be 1.
			By("验证缩容后节点数量减少到 1")
			checker := utils.NewClusterCheckerWithParentKubeconfig(sshExecutor, clusterName, "")
			count, err := checker.GetNodeCountWithKubeconfig("")
			Expect(err).NotTo(HaveOccurred(), "应该成功获取节点数量")
			Expect(count).To(Equal(1), "缩容后应该有 1 个节点（只有 master 节点）")
			GinkgoWriter.Printf("缩容后节点数量: %d (预期: 1)\n", count)
		})
	})

	Describe("创建 1Master 管理集群和 1Master1Worker 业务集群并断开 Worker 节点网络", Label("1m-mgmt-1m1w-workload", "network-disconnect", "post-init", "skip-temporarily"), Ordered, func() {
		var (
			mgmtClusterName        string
			mgmtClusterConfig      *config.BKEClusterConfig
			mgmtConfigPath         string
			mgmtNodeConfigPath     string
			mgmtKubeconfigPath     string
			workloadClusterName    string
			workloadClusterConfig  *config.BKEClusterConfig
			workloadConfigPath     string
			workloadNodeConfigPath string
			workerNode             config.NodeInfo
		)

		BeforeAll(func() {
			By("在引导集群上创建 1Master 管理集群")
			mgmtClusterName = fmt.Sprintf("test-mgmt-1m-%d", time.Now().Unix())

			nodes := config.LoadTestNodesFromEnv()
			Expect(len(nodes)).To(BeNumerically(">=", 1), "1Master 管理集群需要至少 1 个节点")
			// Fail fast: the workload cluster below needs nodes[1] and nodes[2]. Checking
			// this before the management cluster is installed avoids a long install
			// that is guaranteed to end in a precondition failure.
			Expect(len(nodes)).To(BeNumerically(">=", 3), "1Master1Worker 业务集群需要至少 3 个节点（管理集群使用节点 1，业务集群需要节点 2-3）")

			// The first node is the management cluster's master.
			masterNode := nodes[0]
			masterNode.Role = []string{"master/node", "etcd"}

			// Management cluster config. Per the original author's note it includes
			// cluster-api (needed to manage workload clusters) and excludes
			// openfuyao-system-controller.
			mgmtClusterConfig = config.NewBKEClusterConfigForMgmt(mgmtClusterName, []config.NodeInfo{masterNode})

			By("生成管理集群配置文件")
			var err error
			mgmtConfigPath, mgmtNodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(mgmtClusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成管理集群配置文件")

			By("执行管理集群的 bke cluster create 命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(mgmtConfigPath, mgmtNodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "管理集群创建命令应该成功执行")

			By("等待管理集群状态变为 Healthy 和 Ready")
			Eventually(func() bool {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(mgmtClusterName, "")
				if err != nil {
					// The returned values are not meaningful on error; keep polling.
					GinkgoWriter.Printf("获取集群完整状态失败: %v\n", err)
					return false
				}
				GinkgoWriter.Printf("当前管理集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				if state == "DeployFailed" {
					// DeployFailed aborts the setup instead of waiting for the timeout.
					Fail(fmt.Sprintf("管理集群创建失败，状态为 DeployFailed。Phase: %s, clusterStatus: %s", phase, clusterStatus))
				}
				return state == "Healthy" && clusterStatus == "Ready"
			}, mgmtInstallTimeout, pollInterval).Should(BeTrue(), "管理集群应该变为 Healthy 和 Ready 状态")

			By("验证管理集群最终状态")
			phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(mgmtClusterName, "")
			Expect(err).NotTo(HaveOccurred())
			GinkgoWriter.Printf("最终管理集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
			Expect(state).To(Equal("Healthy"), "管理集群状态应该是 Healthy")
			Expect(clusterStatus).To(Equal("Ready"), "管理集群状态应该是 Ready")

			By("获取管理集群 kubeconfig")
			mgmtChecker := utils.NewClusterCheckerWithParentKubeconfig(localExecutor, mgmtClusterName, "")
			mgmtKubeconfigPath, err = mgmtChecker.SaveKubeconfigToFile()
			Expect(err).NotTo(HaveOccurred(), "应该成功获取管理集群 kubeconfig")
			GinkgoWriter.Printf("管理集群 kubeconfig 保存到: %s\n", mgmtKubeconfigPath)

			By("准备业务集群配置并断开 worker 节点网络")
			workloadClusterName = fmt.Sprintf("test-workload-1m1w-disconnect-worker-%d", time.Now().Unix())

			// nodes[0] belongs to the management cluster, so the workload cluster uses
			// nodes[1] as master and nodes[2] as worker.
			workloadMasterNode := nodes[1]
			workloadMasterNode.Role = []string{"master/node", "etcd"}

			workerNode = nodes[2]
			workerNode.Role = []string{"node"}

			// Workload cluster config built with the default constructor.
			workloadClusterConfig = config.NewDefaultBKEClusterConfig(workloadClusterName, []config.NodeInfo{workloadMasterNode, workerNode})

			// Cut the worker's network.
			// NOTE(review): the backup handle returned by DisconnectNetwork is discarded
			// here and AfterAll passes "" to RestoreNetwork, unlike the other Describe
			// blocks in this file which keep and pass the handle — confirm this is intended.
			_, err = DisconnectNetwork(workerNode, localExecutor)
			Expect(err).NotTo(HaveOccurred(), "应该成功断开 worker 节点的网络")
		})

		AfterAll(func() {
			// 使用辅助函数恢复 worker 节点的网络
			if workerNode.IP != "" {
				err := RestoreNetwork(workerNode, localExecutor, "")
				if err != nil {
					GinkgoWriter.Printf("恢复 worker 节点网络失败: %v\n", err)
				}
			}

			// 清理业务集群
			By("清理业务集群")
			if workloadClusterName != "" && mgmtKubeconfigPath != "" {
				if clusterManager.ClusterExistsWithKubeconfig(workloadClusterName, mgmtKubeconfigPath) {
					if err := clusterManager.DeleteClusterWithKubeconfig(workloadClusterName, mgmtKubeconfigPath); err != nil {
						GinkgoWriter.Printf("删除业务集群失败: %v\n", err)
					}
					Eventually(func() bool {
						return !clusterManager.ClusterExistsWithKubeconfig(workloadClusterName, mgmtKubeconfigPath)
					}, uninstallTimeout, 10*time.Second).Should(BeTrue())
				}
				if workloadConfigPath != "" {
					if err := clusterManager.CleanupConfig(workloadConfigPath); err != nil {
						GinkgoWriter.Printf("清理业务集群配置文件失败: %v\n", err)
					}
				}
				if workloadNodeConfigPath != "" {
					if err := clusterManager.CleanupConfig(workloadNodeConfigPath); err != nil {
						GinkgoWriter.Printf("Failed to cleanup workload node config file: %v\n", err)
					}
				}
			}

			// 清理管理集群
			By("清理管理集群")
			// 首先获取管理集群 kubeconfig（删除自身 BC 需要）
			mgmtChecker := utils.NewClusterCheckerWithParentKubeconfig(localExecutor, mgmtClusterName, "")
			mgmtKubeconfigPathForDelete, err := mgmtChecker.SaveKubeconfigToFile()
			if err != nil {
				GinkgoWriter.Printf("[DEBUG] 获取管理集群 kubeconfig 失败: %v\n", err)
				// 使用 BeforeAll 中的 mgmtKubeconfigPath 作为后备
				if mgmtKubeconfigPath != "" {
					mgmtKubeconfigPathForDelete = mgmtKubeconfigPath
				}
			} else if mgmtKubeconfigPath != "" {
				// 清理刚刚创建的临时 kubeconfig
				defer localExecutor.Exec(fmt.Sprintf("rm -f %s", mgmtKubeconfigPathForDelete))
				mgmtKubeconfigPathForDelete = mgmtKubeconfigPath
			}

			// 步骤 1: 在引导集群上删除 BC 资源（管理集群创建的地方）
			if clusterManager.ClusterExistsWithKubeconfig(mgmtClusterName, "") {
				By("在引导集群上触发管理集群删除")
				err := clusterManager.DeleteClusterWithKubeconfig(mgmtClusterName, "")
				if err != nil {
					GinkgoWriter.Printf("在引导集群上触发管理集群删除失败: %v\n", err)
				}
			} else {
				GinkgoWriter.Printf("[DEBUG] 引导集群上不存在管理集群 BC\n")
			}

			// 步骤 2: 在管理集群本身上删除 BC 资源（固定名称: bke-cluster）
			if mgmtKubeconfigPathForDelete != "" {
				By("在管理集群上触发管理集群自身 BC 删除")
				err := clusterManager.DeleteManagementClusterSelfBC(mgmtKubeconfigPathForDelete)
				if err != nil {
					GinkgoWriter.Printf("触发管理集群自身 BC 删除失败: %v\n", err)
				}
			}

			// 步骤 3: 等待两个 BC 资源都被删除
			By("等待管理集群完全删除")
			Eventually(func() bool {
				// 检查引导集群上的 BC
				existsOnBootstrap := clusterManager.ClusterExistsWithKubeconfig(mgmtClusterName, "")

				// 检查管理集群本身的 BC
				existsOnSelf := false
				if mgmtKubeconfigPathForDelete != "" {
					// 管理集群自身 BC 的固定名称为 "bke-cluster"，命名空间为 "bke-cluster"
					bcName := "bke-cluster"
					namespace := "bke-cluster"
					checkSelfCmd := fmt.Sprintf("KUBECONFIG=%s kubectl get bc %s -n %s --no-headers 2>/dev/null", mgmtKubeconfigPathForDelete, bcName, namespace)
					result, _ := localExecutor.Exec(checkSelfCmd)
					existsOnSelf = result.ExitCode == 0 && strings.TrimSpace(result.Stdout) != ""

					if !existsOnSelf {
						// BC 不存在时记录调试信息
						GinkgoWriter.Printf("[DEBUG] 管理集群自身 BC (bke-cluster) 未找到: exit_code=%d\n", result.ExitCode)
					}
				}

				if existsOnBootstrap || existsOnSelf {
					if existsOnBootstrap {
						phase, state, clusterStatus, _ := clusterManager.GetClusterFullStatusWithKubeconfig(mgmtClusterName, "")
						GinkgoWriter.Printf("[DEBUG] 引导集群上的管理集群 BC 仍存在: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
					}
					if existsOnSelf {
						GinkgoWriter.Printf("[DEBUG] 管理集群自身 BC (bke-cluster) 在管理集群上仍存在\n")
					}
					return false
				} else {
					GinkgoWriter.Printf("[DEBUG] 管理集群 BC 资源已删除（引导集群和自身）\n")
					return true
				}
			}, uninstallTimeout, 60*time.Second).Should(BeTrue(), "管理集群应该被完全删除")

			// 步骤 4: 如果等待超时后集群仍未删除，执行强制清理
			existsOnBootstrap := clusterManager.ClusterExistsWithKubeconfig(mgmtClusterName, "")
			existsOnSelf := false
			if mgmtKubeconfigPathForDelete != "" {
				bcName := "bke-cluster"
				namespace := "bke-cluster"
				checkSelfCmd := fmt.Sprintf("KUBECONFIG=%s kubectl get bc %s -n %s --no-headers 2>/dev/null", mgmtKubeconfigPathForDelete, bcName, namespace)
				result, _ := localExecutor.Exec(checkSelfCmd)
				existsOnSelf = result.ExitCode == 0 && strings.TrimSpace(result.Stdout) != ""
			}

			if existsOnBootstrap || existsOnSelf {
				By("超时后管理集群仍存在，在所有管理节点上执行强制重置")
				// 获取管理集群节点（1Master 管理集群使用第一个节点）
				nodes := config.LoadTestNodesFromEnv()
				if len(nodes) >= 1 {
					// 1Master 管理集群只使用第一个节点
					mgmtNodes := []config.NodeInfo{nodes[0]}
					err := clusterManager.ForceResetManagementCluster(mgmtNodes, localExecutor)
					if err != nil {
						GinkgoWriter.Printf("强制重置管理集群失败: %v\n", err)
					} else {
						GinkgoWriter.Printf("在所有节点上完成管理集群强制重置\n")
					}
				} else {
					GinkgoWriter.Printf("未找到管理节点，跳过强制重置\n")
				}
			}

			if mgmtConfigPath != "" {
				if err := clusterManager.CleanupConfig(mgmtConfigPath); err != nil {
					GinkgoWriter.Printf("清理管理集群配置文件失败: %v\n", err)
				}
			}
			if mgmtNodeConfigPath != "" {
				if err := clusterManager.CleanupConfig(mgmtNodeConfigPath); err != nil {
					GinkgoWriter.Printf("Failed to cleanup mgmt node config file: %v\n", err)
				}
			}
			if mgmtKubeconfigPath != "" {
				if _, err := localExecutor.Exec(fmt.Sprintf("rm -f %s", mgmtKubeconfigPath)); err != nil {
					GinkgoWriter.Printf("清理管理集群 kubeconfig 文件失败: %v\n", err)
				}
			}
		})

		// Case: with the worker node's network disconnected, the 1Master1Worker workload cluster should
		// still be created and stay Healthy.
		// Steps: 1) generate the config and create the workload cluster (BC) on the management cluster
		// using mgmtKubeconfigPath; 2) wait for the BC to reach Healthy/Ready while the worker's network
		// is down; 3) verify the workload cluster's final state.
		// Expected: the workload cluster ends with `state` Healthy and `clusterStatus` Ready.
		It("当 worker 节点网络断开时应该成功创建 1Master1Worker 业务集群并保持 Healthy", SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			By("生成业务集群配置文件")
			var err error
			workloadConfigPath, workloadNodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(workloadClusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成业务集群配置文件")

			By("在管理集群上创建业务集群")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(workloadConfigPath, workloadNodeConfigPath, mgmtKubeconfigPath)
			if err != nil {
				// On failure, read the namespace out of the uploaded config and dump
				// the BC resource for debugging before the assertion below fires.
				nsResult, _ := sshExecutor.Exec(fmt.Sprintf("yq eval '.metadata.namespace' %s", workloadConfigPath))
				if nsResult.ExitCode == 0 {
					ns := strings.TrimSpace(nsResult.Stdout)
					bcResult, _ := sshExecutor.Exec(fmt.Sprintf("KUBECONFIG=%s kubectl get bc %s -n %s -o yaml 2>&1", mgmtKubeconfigPath, "bke-"+workloadClusterName, ns))
					GinkgoWriter.Printf("[DEBUG] 应用失败后的 BC 资源状态: exit_code=%d, stdout=%s, stderr=%s\n", bcResult.ExitCode, bcResult.Stdout, bcResult.Stderr)
				}
			}
			Expect(err).NotTo(HaveOccurred(), "业务集群创建命令应该成功执行")

			By("等待业务集群状态变为 Healthy 和 Ready")
			Eventually(func() bool {
				phase, state, clusterStatus, statusErr := clusterManager.GetClusterFullStatusWithKubeconfig(workloadClusterName, mgmtKubeconfigPath)
				if statusErr != nil {
					GinkgoWriter.Printf("获取业务集群完整状态失败: %v, 当前状态: %s\n", statusErr, state)
					return false
				}
				GinkgoWriter.Printf("当前业务集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)

				if state != "DeployFailed" {
					return state == "Healthy" && clusterStatus == "Ready"
				}

				// DeployFailed: dump the BC resource and the latest events, then abort the spec.
				// The BC name and its namespace are both "bke-" + cluster name here.
				resourceID := "bke-" + workloadClusterName
				dump, _ := sshExecutor.Exec(fmt.Sprintf("KUBECONFIG=%s kubectl get bc %s -n %s -o yaml 2>&1", mgmtKubeconfigPath, resourceID, resourceID))
				if dump.ExitCode == 0 {
					GinkgoWriter.Printf("[DEBUG] === DeployFailed 时的 BC 资源 ===\n%s\n", dump.Stdout)
				}
				dump, _ = sshExecutor.Exec(fmt.Sprintf("KUBECONFIG=%s kubectl get events -n %s --sort-by='.lastTimestamp' 2>&1 | tail -20", mgmtKubeconfigPath, resourceID))
				if dump.ExitCode == 0 {
					GinkgoWriter.Printf("[DEBUG] === 最近的事件 ===\n%s\n", dump.Stdout)
				}
				Fail(fmt.Sprintf("业务集群创建失败，状态为 DeployFailed。Phase: %s, clusterStatus: %s", phase, clusterStatus))
				return false
			}, installTimeout, pollInterval).Should(BeTrue(), "即使 worker 节点网络断开，业务集群也应该变为 Healthy 和 Ready 状态")

			By("验证业务集群最终状态")
			finalPhase, finalState, finalStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(workloadClusterName, mgmtKubeconfigPath)
			Expect(err).NotTo(HaveOccurred())
			GinkgoWriter.Printf("最终业务集群状态: phase=%s, state=%s, clusterStatus=%s\n", finalPhase, finalState, finalStatus)
			Expect(finalState).To(Equal("Healthy"), "当 worker 节点网络断开时，业务集群状态应该是 Healthy")
			Expect(finalStatus).To(Equal("Ready"), "业务集群状态应该是 Ready")
		})
	})

	Describe("创建 1Master 集群并使用无效的 Master 节点用户名密码", Label("1master", "invalid-credentials", "post-init"), func() {
		var (
			clusterName    string
			clusterConfig  *config.BKEClusterConfig
			configPath     string
			nodeConfigPath string
		)

		BeforeEach(func() {
			// Unique cluster name per spec, suffixed with the current Unix timestamp.
			clusterName = fmt.Sprintf("test-1m-invalid-%d", time.Now().Unix())

			testNodes := config.LoadTestNodesFromEnv()
			Expect(len(testNodes)).To(BeNumerically(">=", 1), "1Master 集群需要至少 1 个节点")

			// The first node becomes the master, with a deliberately wrong password
			// (the username is left as loaded).
			badMaster := testNodes[0]
			badMaster.Role = []string{"master/node", "etcd"}
			badMaster.Password = "invalid-password"

			// Cluster config carrying the broken master credentials.
			members := []config.NodeInfo{badMaster}
			clusterConfig = config.NewDefaultBKEClusterConfig(clusterName, members)
		})

		AfterEach(func() {
			// Deferred so that it also runs when the deletion assertion below fails: a
			// failed assertion unwinds this function, which previously skipped both the
			// config cleanup and the cool-down sleep.
			defer func() {
				if configPath != "" {
					if err := clusterManager.CleanupConfig(configPath); err != nil {
						GinkgoWriter.Printf("清理配置文件失败: %v\n", err)
					}
				}
				if nodeConfigPath != "" {
					if err := clusterManager.CleanupConfig(nodeConfigPath); err != nil {
						GinkgoWriter.Printf("Failed to cleanup node config file: %v\n", err)
					}
				}
				// Per the original author's note: wait for the SSH ban to expire so
				// that later specs are not affected.
				time.Sleep(60 * time.Second)
			}()

			By("清理测试集群")
			if !clusterManager.ClusterExistsWithKubeconfig(clusterName, "") {
				return
			}
			if err := clusterManager.DeleteClusterWithKubeconfig(clusterName, ""); err != nil {
				GinkgoWriter.Printf("删除集群失败: %v\n", err)
			}
			// Poll until the cluster resource is gone.
			Eventually(func() bool {
				return !clusterManager.ClusterExistsWithKubeconfig(clusterName, "")
			}, uninstallTimeout, 10*time.Second).Should(BeTrue())
		})

		// Case: cluster creation must fail when the master node's credentials are invalid.
		// Steps: 1) generate a cluster config whose master node carries invalid credentials (prepared in
		// BeforeEach); 2) upload the config via the guide node and verify it exists there; 3) run
		// `bke cluster create` and wait for the cluster to report DeployFailed/InitializationFailed.
		// Expected: the final cluster state is not Healthy; creation fails with the expected status.
		It("使用无效的 master 节点凭据应该无法创建集群", SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			By("生成带有无效 master 节点凭据的集群配置")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("验证配置文件已上传")
			err = clusterManager.GetConfigGenerator().ValidateConfigOnNode(configPath)
			Expect(err).NotTo(HaveOccurred(), "配置文件应该存在于引导节点")

			By("执行 bke cluster create 命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待并验证集群创建失败")
			// Poll every 30s for up to 10 minutes until the cluster reports a failure.
			checkInterval := 30 * time.Second
			checkTimeout := 10 * time.Minute
			Eventually(func() bool {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					// The returned values are not meaningful on error; keep polling.
					GinkgoWriter.Printf("获取集群完整状态失败: %v\n", err)
					return false
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)

				if state == "Healthy" {
					// A Healthy cluster means the bad credentials were not rejected; abort right away.
					Fail(fmt.Sprintf("集群意外变为 Healthy。预期集群创建失败，但集群状态为: %s", state))
				}

				// Either signal is accepted: per the original author's note, state may
				// never reach DeployFailed even when clusterStatus stays at
				// InitializationFailed for a long time.
				return state == "DeployFailed" || clusterStatus == "InitializationFailed"
			}, checkTimeout, checkInterval).WithContext(ctx).Should(BeTrue(), "集群创建应该失败，状态为 DeployFailed 和 InitializationFailed")

			// Final state check.
			finalState, err := clusterManager.GetClusterStatusWithKubeconfig(clusterName, "")
			Expect(err).NotTo(HaveOccurred())
			Expect(finalState).NotTo(Equal("Healthy"), "当 master 节点凭据无效时，集群不应该是 Healthy")
			GinkgoWriter.Printf("最终集群状态: %s (预期失败，不是 Healthy)\n", finalState)
		})
	})

	Describe("创建 1Master1Worker 集群并使用无效的 Worker 节点用户名密码", Label("1master1worker", "invalid-credentials", "post-init"), func() {
		var (
			clusterName    string
			clusterConfig  *config.BKEClusterConfig
			configPath     string
			nodeConfigPath string
		)

		// BeforeEach builds a fresh 1-master/1-worker cluster config in which the worker node's
		// username and password are replaced with invalid placeholder values.
		BeforeEach(func() {
			clusterName = fmt.Sprintf("test-1m1w-invalid-%d", time.Now().Unix())

			testNodes := config.LoadTestNodesFromEnv()
			Expect(len(testNodes)).To(BeNumerically(">=", 2), "1Master1Worker 集群需要至少 2 个节点")

			// Node 0 becomes the master; its credentials are left untouched.
			master := testNodes[0]
			master.Role = []string{"master/node", "etcd"}

			// Node 1 becomes the worker and gets the wrong login on purpose.
			worker := testNodes[1]
			worker.Username, worker.Password = "invalid-user", "invalid-password"
			worker.Role = []string{"node"}

			clusterConfig = config.NewDefaultBKEClusterConfig(clusterName, []config.NodeInfo{master, worker})
		})

		// AfterEach deletes the test cluster when it still exists, waits until it is gone, and then
		// removes the generated config files. Cleanup errors are logged, not asserted.
		AfterEach(func() {
			By("清理测试集群")
			clusterGone := func() bool {
				return !clusterManager.ClusterExistsWithKubeconfig(clusterName, "")
			}
			if !clusterGone() {
				if delErr := clusterManager.DeleteClusterWithKubeconfig(clusterName, ""); delErr != nil {
					GinkgoWriter.Printf("Failed to delete cluster: %v\n", delErr)
				}
				Eventually(clusterGone, uninstallTimeout, 10*time.Second).Should(BeTrue())
			}

			// Remove the uploaded files in the same order they were generated.
			leftovers := []struct{ path, failFormat string }{
				{configPath, "清理配置文件失败: %v\n"},
				{nodeConfigPath, "Failed to cleanup node config file: %v\n"},
			}
			for _, file := range leftovers {
				if file.path == "" {
					continue
				}
				if rmErr := clusterManager.CleanupConfig(file.path); rmErr != nil {
					GinkgoWriter.Printf(file.failFormat, rmErr)
				}
			}
		})

		// Case: cluster creation should still succeed when the worker node credentials are invalid.
		// Steps: 1) generate and upload the config whose worker username/password are invalid;
		// 2) confirm the config file exists on the guide node; 3) run `bke cluster create` and wait
		// for state=Healthy and clusterStatus=Ready; 4) re-read and assert the final status.
		// Expected: the cluster ends up with state=Healthy and clusterStatus=Ready.
		It("即使使用无效的 worker 节点凭据也应该成功创建集群", SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			By("生成带有无效 worker 节点凭据的集群配置")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("验证配置文件已上传")
			err = clusterManager.GetConfigGenerator().ValidateConfigOnNode(configPath)
			Expect(err).NotTo(HaveOccurred(), "配置文件应该存在于引导节点")

			By("执行 bke cluster create 命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy 和 Ready")
			Eventually(func() bool {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					// A failed status query is not a verdict on the cluster; log it and poll again.
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return false
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				if state == "DeployFailed" {
					// DeployFailed is treated as terminal: stop polling and fail the spec now.
					Fail(fmt.Sprintf("集群创建失败，状态为 DeployFailed。Phase: %s, clusterStatus: %s", phase, clusterStatus))
				}
				return state == "Healthy" && clusterStatus == "Ready"
			}, installTimeout, pollInterval).Should(BeTrue(), "集群应该变为 Healthy 和 Ready 状态")

			By("验证集群最终状态")
			phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
			Expect(err).NotTo(HaveOccurred())
			GinkgoWriter.Printf("最终集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
			Expect(state).To(Equal("Healthy"), "集群状态应该是 Healthy")
			Expect(clusterStatus).To(Equal("Ready"), "集群状态应该是 Ready")
		})
	})

	Describe("创建 1Master 管理集群和 1Master1Worker 业务集群并使用无效的 Worker 节点用户名密码", Label("1m-mgmt-1m1w-workload", "invalid-credentials", "post-init"), Ordered, func() {
		var (
			mgmtClusterName        string
			mgmtClusterConfig      *config.BKEClusterConfig
			mgmtConfigPath         string
			mgmtNodeConfigPath     string
			mgmtKubeconfigPath     string
			workloadClusterName    string
			workloadClusterConfig  *config.BKEClusterConfig
			workloadConfigPath     string
			workloadNodeConfigPath string
		)

		// BeforeAll creates the single-master management cluster on the bootstrap cluster, waits
		// until it reports state=Healthy and clusterStatus=Ready, and stores its kubeconfig path in
		// mgmtKubeconfigPath for the specs and the teardown of this container.
		BeforeAll(func() {
			By("在引导集群上创建 1Master 管理集群")
			mgmtClusterName = fmt.Sprintf("test-mgmt-1m-%d", time.Now().Unix())

			nodes := config.LoadTestNodesFromEnv()
			Expect(len(nodes)).To(BeNumerically(">=", 1), "1Master 管理集群需要至少 1 个节点")

			// The first node is the master.
			masterNode := nodes[0]
			masterNode.Role = []string{"master/node", "etcd"}

			// Management-cluster flavour of the config. Per the original author's note it includes
			// cluster-api (needed to manage workload clusters) and excludes
			// openfuyao-system-controller; that is decided inside NewBKEClusterConfigForMgmt.
			mgmtClusterConfig = config.NewBKEClusterConfigForMgmt(mgmtClusterName, []config.NodeInfo{masterNode})

			By("生成管理集群配置文件")
			var err error
			mgmtConfigPath, mgmtNodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(mgmtClusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成管理集群配置文件")

			By("执行管理集群的 bke cluster create 命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(mgmtConfigPath, mgmtNodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "管理集群创建命令应该成功执行")

			By("等待管理集群状态变为 Healthy 和 Ready")
			Eventually(func() bool {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(mgmtClusterName, "")
				if err != nil {
					// A failed status query is not a verdict on the cluster; log it and poll again.
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return false
				}
				GinkgoWriter.Printf("当前管理集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				if state == "DeployFailed" {
					// DeployFailed is treated as terminal: stop polling and fail now.
					Fail(fmt.Sprintf("管理集群创建失败，状态为 DeployFailed。Phase: %s, clusterStatus: %s", phase, clusterStatus))
				}
				return state == "Healthy" && clusterStatus == "Ready"
			}, mgmtInstallTimeout, pollInterval).Should(BeTrue(), "管理集群应该变为 Healthy 和 Ready 状态")

			By("验证管理集群最终状态")
			phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(mgmtClusterName, "")
			Expect(err).NotTo(HaveOccurred())
			GinkgoWriter.Printf("最终管理集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
			Expect(state).To(Equal("Healthy"), "管理集群状态应该是 Healthy")
			Expect(clusterStatus).To(Equal("Ready"), "管理集群状态应该是 Ready")

			By("获取管理集群 kubeconfig")
			mgmtChecker := utils.NewClusterCheckerWithParentKubeconfig(sshExecutor, mgmtClusterName, "")
			mgmtKubeconfigPath, err = mgmtChecker.SaveKubeconfigToFileWithKubeconfig("")
			Expect(err).NotTo(HaveOccurred(), "应该成功获取管理集群 kubeconfig")
			GinkgoWriter.Printf("管理集群 kubeconfig 保存到: %s\n", mgmtKubeconfigPath)
		})

		// AfterAll tears down the workload cluster, then the management cluster, then the generated
		// config and kubeconfig files.
		//
		// The wait for the management cluster to disappear is run through InterceptGomegaFailure:
		// a plain failing Eventually(...).Should aborts this node immediately, which would make the
		// force reset and the file cleanup below unreachable exactly when they are needed. The
		// intercepted failure is re-asserted as the last statement so a timeout still fails the run.
		AfterAll(func() {
			By("清理业务集群")
			if workloadClusterName != "" && mgmtKubeconfigPath != "" {
				if clusterManager.ClusterExistsWithKubeconfig(workloadClusterName, mgmtKubeconfigPath) {
					if err := clusterManager.DeleteClusterWithKubeconfig(workloadClusterName, mgmtKubeconfigPath); err != nil {
						GinkgoWriter.Printf("删除业务集群失败: %v\n", err)
					}
					Eventually(func() bool {
						return !clusterManager.ClusterExistsWithKubeconfig(workloadClusterName, mgmtKubeconfigPath)
					}, uninstallTimeout, 10*time.Second).Should(BeTrue())
				}
				if workloadConfigPath != "" {
					if err := clusterManager.CleanupConfig(workloadConfigPath); err != nil {
						GinkgoWriter.Printf("清理业务集群配置文件失败: %v\n", err)
					}
				}
				if workloadNodeConfigPath != "" {
					if err := clusterManager.CleanupConfig(workloadNodeConfigPath); err != nil {
						GinkgoWriter.Printf("Failed to cleanup workload node config file: %v\n", err)
					}
				}
			}

			By("清理管理集群")
			// Fetch the management cluster kubeconfig first; it is needed to delete the cluster's own BC.
			mgmtChecker := utils.NewClusterCheckerWithParentKubeconfig(localExecutor, mgmtClusterName, "")
			mgmtKubeconfigPathForDelete, err := mgmtChecker.SaveKubeconfigToFile()
			if err != nil {
				GinkgoWriter.Printf("[DEBUG] 获取管理集群 kubeconfig 失败: %v\n", err)
				// Fall back to the kubeconfig saved in BeforeAll.
				if mgmtKubeconfigPath != "" {
					mgmtKubeconfigPathForDelete = mgmtKubeconfigPath
				}
			} else if mgmtKubeconfigPath != "" {
				// Prefer the BeforeAll kubeconfig and drop the temporary one just written. The rm
				// command string is evaluated here, before the variable is reassigned below.
				defer localExecutor.Exec(fmt.Sprintf("rm -f %s", mgmtKubeconfigPathForDelete))
				mgmtKubeconfigPathForDelete = mgmtKubeconfigPath
			}

			// selfBCExists reports whether the management cluster's own BC (fixed name and namespace
			// "bke-cluster") is still present. Any failure to query it counts as "not present".
			selfBCExists := func() bool {
				if mgmtKubeconfigPathForDelete == "" {
					return false
				}
				bcName := "bke-cluster"
				namespace := "bke-cluster"
				checkSelfCmd := fmt.Sprintf("KUBECONFIG=%s kubectl get bc %s -n %s --no-headers 2>/dev/null", mgmtKubeconfigPathForDelete, bcName, namespace)
				result, err := localExecutor.Exec(checkSelfCmd)
				if err != nil {
					// Do not touch result when the command could not be executed.
					GinkgoWriter.Printf("[DEBUG] 检查管理集群自身 BC (bke-cluster) 失败: %v\n", err)
					return false
				}
				exists := result.ExitCode == 0 && strings.TrimSpace(result.Stdout) != ""
				if !exists {
					GinkgoWriter.Printf("[DEBUG] 管理集群自身 BC (bke-cluster) 未找到: exit_code=%d\n", result.ExitCode)
				}
				return exists
			}

			// Step 1: delete the BC on the bootstrap cluster, where the management cluster was created.
			if clusterManager.ClusterExistsWithKubeconfig(mgmtClusterName, "") {
				By("在引导集群上触发管理集群删除")
				err := clusterManager.DeleteClusterWithKubeconfig(mgmtClusterName, "")
				if err != nil {
					GinkgoWriter.Printf("在引导集群上触发管理集群删除失败: %v\n", err)
				}
			} else {
				GinkgoWriter.Printf("[DEBUG] 引导集群上不存在管理集群 BC\n")
			}

			// Step 2: delete the BC on the management cluster itself.
			if mgmtKubeconfigPathForDelete != "" {
				By("在管理集群上触发管理集群自身 BC 删除")
				err := clusterManager.DeleteManagementClusterSelfBC(mgmtKubeconfigPathForDelete)
				if err != nil {
					GinkgoWriter.Printf("触发管理集群自身 BC 删除失败: %v\n", err)
				}
			}

			// Step 3: wait for both BC resources to disappear without aborting the node on timeout.
			By("等待管理集群完全删除")
			waitErr := InterceptGomegaFailure(func() {
				Eventually(func() bool {
					existsOnBootstrap := clusterManager.ClusterExistsWithKubeconfig(mgmtClusterName, "")
					existsOnSelf := selfBCExists()

					if !existsOnBootstrap && !existsOnSelf {
						GinkgoWriter.Printf("[DEBUG] 管理集群 BC 资源已删除（引导集群和自身）\n")
						return true
					}
					if existsOnBootstrap {
						phase, state, clusterStatus, _ := clusterManager.GetClusterFullStatusWithKubeconfig(mgmtClusterName, "") // debug output only
						GinkgoWriter.Printf("[DEBUG] 引导集群上的管理集群 BC 仍存在: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
					}
					if existsOnSelf {
						GinkgoWriter.Printf("[DEBUG] 管理集群自身 BC (bke-cluster) 在管理集群上仍存在\n")
					}
					return false
				}, uninstallTimeout, 60*time.Second).Should(BeTrue(), "管理集群应该被完全删除")
			})
			if waitErr != nil {
				GinkgoWriter.Printf("等待管理集群删除未成功，继续执行强制清理: %v\n", waitErr)
			}

			// Step 4: if anything is still left after the wait, force-reset the management node.
			if clusterManager.ClusterExistsWithKubeconfig(mgmtClusterName, "") || selfBCExists() {
				By("超时后管理集群仍存在，在所有管理节点上执行强制重置")
				nodes := config.LoadTestNodesFromEnv()
				if len(nodes) >= 1 {
					// The 1-master management cluster only uses the first node.
					mgmtNodes := []config.NodeInfo{nodes[0]}
					err := clusterManager.ForceResetManagementCluster(mgmtNodes, localExecutor)
					if err != nil {
						GinkgoWriter.Printf("强制重置管理集群失败: %v\n", err)
					} else {
						GinkgoWriter.Printf("在所有节点上完成管理集群强制重置\n")
					}
				} else {
					GinkgoWriter.Printf("未找到管理节点，跳过强制重置\n")
				}
			}

			if mgmtConfigPath != "" {
				if err := clusterManager.CleanupConfig(mgmtConfigPath); err != nil {
					GinkgoWriter.Printf("清理管理集群配置文件失败: %v\n", err)
				}
			}
			if mgmtNodeConfigPath != "" {
				if err := clusterManager.CleanupConfig(mgmtNodeConfigPath); err != nil {
					GinkgoWriter.Printf("Failed to cleanup mgmt node config file: %v\n", err)
				}
			}
			if mgmtKubeconfigPath != "" {
				if err := clusterManager.CleanupConfig(mgmtKubeconfigPath); err != nil {
					GinkgoWriter.Printf("清理管理集群 kubeconfig 文件失败: %v\n", err)
				}
			}

			// Report the deletion timeout only after all cleanup has been attempted.
			Expect(waitErr).NotTo(HaveOccurred(), "管理集群应该被完全删除")
		})

		// Case: a 1-master/1-worker workload cluster created on the management cluster should come
		// up even though its worker node credentials are invalid.
		// Steps: 1) build the workload config with an invalid worker username/password; 2) generate
		// and upload the config; 3) create the cluster using the management cluster kubeconfig;
		// 4) wait for state=Healthy and clusterStatus=Ready, then assert the final status.
		// Expected: the workload cluster ends up with state=Healthy and clusterStatus=Ready.
		It("在管理集群上使用无效的 worker 节点凭据应该成功创建 1Master1Worker 业务集群", SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			By("创建带有无效 worker 节点凭据的 1Master1Worker 业务集群配置")
			workloadClusterName = fmt.Sprintf("test-workload-1m1w-%d", time.Now().Unix())

			nodes := config.LoadTestNodesFromEnv()
			// nodes[0] belongs to the management cluster, so the workload cluster needs nodes[1] and nodes[2].
			Expect(len(nodes)).To(BeNumerically(">=", 3), "1Master1Worker 业务集群需要至少 3 个节点（管理集群使用节点 1，业务集群需要节点 2-3）")

			// nodes[1] is the workload master.
			masterNode := nodes[1]
			masterNode.Role = []string{"master/node", "etcd"}

			// nodes[2] is the workload worker and gets the wrong login on purpose.
			workerNode := nodes[2]
			workerNode.Role = []string{"node"}
			workerNode.Username = "invalid-user"
			workerNode.Password = "invalid-password"

			workloadClusterConfig = config.NewDefaultBKEClusterConfig(workloadClusterName, []config.NodeInfo{masterNode, workerNode})

			By("生成业务集群配置文件")
			var err error
			workloadConfigPath, workloadNodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(workloadClusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成业务集群配置文件")

			By("在管理集群上创建业务集群")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(workloadConfigPath, workloadNodeConfigPath, mgmtKubeconfigPath)
			Expect(err).NotTo(HaveOccurred(), "业务集群创建命令应该成功执行")

			By("等待业务集群状态变为 Healthy 和 Ready")
			Eventually(func() bool {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(workloadClusterName, mgmtKubeconfigPath)
				if err != nil {
					// A failed status query is not a verdict on the cluster; log it and poll again.
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return false
				}
				GinkgoWriter.Printf("当前业务集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				if state == "DeployFailed" {
					// DeployFailed is treated as terminal: stop polling and fail the spec now.
					Fail(fmt.Sprintf("业务集群创建失败，状态为 DeployFailed。Phase: %s, clusterStatus: %s", phase, clusterStatus))
				}
				return state == "Healthy" && clusterStatus == "Ready"
			}, installTimeout, pollInterval).Should(BeTrue(), "业务集群应该变为 Healthy 和 Ready 状态")

			By("验证业务集群最终状态")
			phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(workloadClusterName, mgmtKubeconfigPath)
			Expect(err).NotTo(HaveOccurred())
			GinkgoWriter.Printf("最终业务集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
			Expect(state).To(Equal("Healthy"), "业务集群状态应该是 Healthy")
			Expect(clusterStatus).To(Equal("Ready"), "业务集群状态应该是 Ready")
		})
	})

	Describe("创建 3Master1Worker 管理集群和业务集群并使用自定义证书签名", Label("3m1w-custom", "certificate", "skip-temporarily"), Ordered, func() {
		var (
			mgmtCluster     *Cluster3M1WConfig
			workloadCluster *Cluster3M1WConfig
			certGenerator   *utils.CertificateConfigGenerator
		)

		// BeforeAll skips the container when fewer than 8 test nodes are configured. Otherwise it
		// generates the certificate chain and the certificate configs, then creates the 3M1W
		// management cluster followed by the 3M1W workload cluster.
		BeforeAll(func() {
			if nodeCount := len(config.LoadTestNodesFromEnv()); nodeCount < 8 {
				Skip(fmt.Sprintf("需要至少 8 个节点（管理集群 4 个 + 业务集群 4 个），当前 %d 个，跳过", nodeCount))
			}

			// Step 1: certificate chain files, generated through the SSH executor.
			const certRoot = "/etc/openFuyao/certs"
			By("在引导节点上生成证书链文件")
			err := utils.GenerateCertificateChainOnRemote(sshExecutor, certRoot)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成证书链文件")
			GinkgoWriter.Printf("证书链文件在引导节点上生成于: %s\n", certRoot)

			// Step 2: certificate configs under the cert_config sub-directory.
			By("在引导节点上生成证书配置")
			certConfigDir := "/etc/openFuyao/certs/cert_config"
			certGenerator = utils.NewCertificateConfigGenerator(certConfigDir)
			err = certGenerator.GenerateAllCertificateConfigs()
			Expect(err).NotTo(HaveOccurred(), "应该成功生成证书配置")
			GinkgoWriter.Printf("证书配置在引导节点上生成于: %s\n", certConfigDir)

			// Step 3: management cluster; the boolean argument is includeClusterAPI.
			mgmtCluster, err = Create3M1WManagementCluster(clusterManager, localExecutor, "test-mgmt-3m1w-cert", true)
			Expect(err).NotTo(HaveOccurred(), "应该成功创建管理集群")

			// Step 4: workload cluster, created through the management cluster kubeconfig.
			workloadCluster, err = Create3M1WWorkloadCluster(clusterManager, localExecutor, mgmtCluster.KubeconfigPath, "test-workload-3m1w-cert")
			Expect(err).NotTo(HaveOccurred(), "应该成功创建业务集群")
		})

		// Case: the certificate chain must verify on every master node.
		// Steps: 1) rely on the certificate chain and configs generated in BeforeAll; 2) run the
		// chain verification on the management cluster masters (indices 0-2) and the workload
		// cluster masters (indices 4-6); 3) assert every verification item succeeded.
		// Expected: every item returned by VerifyCertificateChain reports success.
		It("应该验证所有 master 节点的证书签名", SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			nodes := config.LoadTestNodesFromEnv()
			if len(nodes) < 8 {
				Skip(fmt.Sprintf("需要至少 8 个节点（管理集群 4 个 + 业务集群 4 个），当前 %d 个，跳过", len(nodes)))
			}

			// verifyMasters checks nodes[from:to]. clusterLabel is the cluster name fragment used in
			// step titles, log lines and failure messages; node numbers are reported 1-based.
			verifyMasters := func(clusterLabel string, from, to int) {
				for i := from; i < to; i++ {
					masterNode := nodes[i]
					By(fmt.Sprintf("验证%s master 节点 %d (%s) 的证书签名", clusterLabel, i+1, masterNode.IP))
					verifier := utils.NewCertificateVerifierWithSSH("/etc/kubernetes/pki", localExecutor, masterNode.IP, masterNode.Username, masterNode.Password)
					results, err := verifier.VerifyCertificateChain()
					Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("应该成功验证%s master 节点 %d 的证书链", clusterLabel, i+1))
					GinkgoWriter.Printf("=== %s Master 节点 %d (%s) 证书验证结果 ===\n", clusterLabel, i+1, masterNode.IP)
					verifier.PrintResults(results)
					for _, item := range results.Items {
						if !item.Success {
							// Log the item's error before the assertion stops the spec.
							GinkgoWriter.Printf("%s master 节点 %d 证书验证失败: %s - %v\n", clusterLabel, i+1, item.Name, item.Error)
						}
						Expect(item.Success).To(BeTrue(), fmt.Sprintf("%s master 节点 %d 证书验证应该成功: %s", clusterLabel, i+1, item.Name))
					}
				}
			}

			// Management cluster masters: nodes 1-3 (indices 0-2).
			By("验证管理集群所有 master 节点的证书签名")
			verifyMasters("管理集群", 0, 3)

			// Workload cluster masters: nodes 5-7 (indices 4-6).
			By("验证业务集群所有 master 节点的证书签名")
			verifyMasters("业务集群", 4, 7)
		})

		// AfterAll removes the workload cluster, then the management cluster, together with their
		// generated files, and finally the certificate directory. File cleanup errors are logged only.
		AfterAll(func() {
			// removeFile deletes one generated file when its path is set and logs a failure
			// using the given format.
			removeFile := func(path, failFormat string) {
				if path == "" {
					return
				}
				if rmErr := clusterManager.CleanupConfig(path); rmErr != nil {
					GinkgoWriter.Printf(failFormat, rmErr)
				}
			}

			// Workload cluster: its deletion needs the management cluster kubeconfig.
			if workloadCluster != nil && mgmtCluster != nil {
				Delete3M1WWorkloadCluster(clusterManager, localExecutor, workloadCluster.ClusterName, mgmtCluster.KubeconfigPath)
				removeFile(workloadCluster.ConfigPath, "清理业务集群配置文件失败: %v\n")
				removeFile(workloadCluster.NodeConfigPath, "Failed to cleanup workload node config file: %v\n")
			}

			// Management cluster, its config files and its kubeconfig.
			if mgmtCluster != nil {
				Delete3M1WManagementCluster(clusterManager, localExecutor, mgmtCluster.ClusterName, mgmtCluster.KubeconfigPath)
				removeFile(mgmtCluster.ConfigPath, "清理管理集群配置文件失败: %v\n")
				removeFile(mgmtCluster.NodeConfigPath, "Failed to cleanup mgmt node config file: %v\n")
				removeFile(mgmtCluster.KubeconfigPath, "清理管理集群 kubeconfig 文件失败: %v\n")
			}

			// The certificate directory is removed regardless of whether the clusters were created.
			By("从引导节点清理证书目录")
			if rmErr := clusterManager.CleanupPathRecursive("/etc/openFuyao/certs"); rmErr != nil {
				GinkgoWriter.Printf("清理证书目录失败: %v\n", rmErr)
			}
		})
	})

	Describe("创建 3Master1Worker 管理集群和业务集群", Label("3m1w-no-custom", "certificate", "skip-temporarily"), Ordered, func() {
		var (
			mgmtCluster     *Cluster3M1WConfig
			workloadCluster *Cluster3M1WConfig
		)

		// BeforeAll skips the container when fewer than 8 test nodes are configured; otherwise it
		// creates the 3M1W management cluster and then the 3M1W workload cluster on top of it.
		BeforeAll(func() {
			if nodeCount := len(config.LoadTestNodesFromEnv()); nodeCount < 8 {
				Skip(fmt.Sprintf("需要至少 8 个节点（管理集群 4 个 + 业务集群 4 个），当前 %d 个，跳过", nodeCount))
			}

			var err error

			// Step 1: management cluster; the boolean argument is includeClusterAPI.
			By("创建 3Master1Worker 管理集群")
			mgmtCluster, err = Create3M1WManagementCluster(clusterManager, localExecutor, "test-mgmt-3m1w", true)
			Expect(err).NotTo(HaveOccurred(), "应该成功创建管理集群")

			// Step 2: workload cluster, created through the management cluster kubeconfig.
			By("创建 3Master1Worker 业务集群")
			workloadCluster, err = Create3M1WWorkloadCluster(clusterManager, localExecutor, mgmtCluster.KubeconfigPath, "test-workload-3m1w-basic")
			Expect(err).NotTo(HaveOccurred(), "应该成功创建业务集群")
		})

		// Case: both the management cluster and the workload cluster must be healthy.
		// Steps: 1) start from the clusters created in BeforeAll; 2) poll the management cluster
		// status until state is Healthy; 3) do the same for the workload cluster through the
		// management cluster kubeconfig.
		// Expected: both clusters report state Healthy.
		It("应该验证管理集群和业务集群是健康的", SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			By("验证管理集群健康状态")
			Eventually(func() string {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatus(mgmtCluster.ClusterName)
				if err != nil {
					// A failed status query is not a verdict on the cluster; log it and poll again.
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return ""
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state
			}, mgmtInstallTimeout, pollInterval).Should(Equal("Healthy"))

			By("验证业务集群健康状态")
			Eventually(func() string {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(workloadCluster.ClusterName, mgmtCluster.KubeconfigPath)
				if err != nil {
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return ""
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state
			}, installTimeout, pollInterval).Should(Equal("Healthy"))

			GinkgoWriter.Printf("=== 管理集群和业务集群都是健康的 ===\n")
			GinkgoWriter.Printf("管理集群: %s\n", mgmtCluster.ClusterName)
			GinkgoWriter.Printf("业务集群: %s\n", workloadCluster.ClusterName)
		})

		// Case: the kubeconfig mount must verify on every master node.
		// Steps: 1) load the test node list; 2) run VerifyKubeconfigMount on the management cluster
		// masters (indices 0-2) and the workload cluster masters (indices 4-6); 3) assert every
		// verification item succeeded.
		// Expected: every item returned by VerifyKubeconfigMount reports success.
		It("应该验证所有 master 节点的 kubeconfig 挂载", SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			nodes := config.LoadTestNodesFromEnv()
			if len(nodes) < 8 {
				Skip(fmt.Sprintf("需要至少 8 个节点（管理集群 4 个 + 业务集群 4 个），当前 %d 个，跳过", len(nodes)))
			}

			// verifyMasters checks nodes[from:to]. clusterLabel is the cluster name fragment used in
			// step titles, log lines and failure messages; node numbers are reported 1-based.
			verifyMasters := func(clusterLabel string, from, to int) {
				for i := from; i < to; i++ {
					masterNode := nodes[i]
					By(fmt.Sprintf("验证%s master 节点 %d (%s) 的 kubeconfig 挂载", clusterLabel, i+1, masterNode.IP))
					verifier := utils.NewCertificateVerifierWithSSH("/etc/kubernetes/pki", localExecutor, masterNode.IP, masterNode.Username, masterNode.Password)
					results, err := verifier.VerifyKubeconfigMount()
					Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("应该成功验证%s master 节点 %d 的 kubeconfig 挂载", clusterLabel, i+1))
					GinkgoWriter.Printf("=== %s Master 节点 %d (%s) Kubeconfig 挂载验证结果 ===\n", clusterLabel, i+1, masterNode.IP)
					verifier.PrintResults(results)
					for _, item := range results.Items {
						if !item.Success {
							// Log the item's error before the assertion stops the spec.
							GinkgoWriter.Printf("%s master 节点 %d kubeconfig 挂载验证失败: %s - %v\n", clusterLabel, i+1, item.Name, item.Error)
						}
						Expect(item.Success).To(BeTrue(), fmt.Sprintf("%s master 节点 %d kubeconfig 挂载验证应该成功: %s", clusterLabel, i+1, item.Name))
					}
				}
			}

			// Management cluster masters: nodes 1-3 (indices 0-2).
			By("验证管理集群所有 master 节点的 kubeconfig 挂载")
			verifyMasters("管理集群", 0, 3)

			// Workload cluster masters: nodes 5-7 (indices 4-6).
			By("验证业务集群所有 master 节点的 kubeconfig 挂载")
			verifyMasters("业务集群", 4, 7)
		})

		// AfterAll removes the workload cluster, then the management cluster, together with the
		// files generated for each. File cleanup errors are logged only.
		AfterAll(func() {
			// removeFile deletes one generated file when its path is set and logs a failure
			// using the given format.
			removeFile := func(path, failFormat string) {
				if path == "" {
					return
				}
				if rmErr := clusterManager.CleanupConfig(path); rmErr != nil {
					GinkgoWriter.Printf(failFormat, rmErr)
				}
			}

			// Workload cluster: its deletion needs the management cluster kubeconfig.
			if workloadCluster != nil && mgmtCluster != nil {
				Delete3M1WWorkloadCluster(clusterManager, localExecutor, workloadCluster.ClusterName, mgmtCluster.KubeconfigPath)
				removeFile(workloadCluster.ConfigPath, "清理业务集群配置文件失败: %v\n")
				removeFile(workloadCluster.NodeConfigPath, "Failed to cleanup workload node config file: %v\n")
			}

			// Management cluster, its config files and its kubeconfig.
			if mgmtCluster != nil {
				Delete3M1WManagementCluster(clusterManager, localExecutor, mgmtCluster.ClusterName, mgmtCluster.KubeconfigPath)
				removeFile(mgmtCluster.ConfigPath, "清理管理集群配置文件失败: %v\n")
				removeFile(mgmtCluster.NodeConfigPath, "Failed to cleanup mgmt node config file: %v\n")
				removeFile(mgmtCluster.KubeconfigPath, "清理管理集群 kubeconfig 文件失败: %v\n")
			}
		})
	})

	Describe("节点标签功能", Label("node-labels", "post-init"), func() {
		var (
			clusterName    string
			clusterConfig  *config.BKEClusterConfig
			configPath     string
			nodeConfigPath string
		)

		// BeforeEach builds a fresh 1-master/1-worker cluster config in which both nodes carry
		// the labels that the specs of this container check.
		BeforeEach(func() {
			clusterName = fmt.Sprintf("test-labels-%d", time.Now().Unix())

			testNodes := config.LoadTestNodesFromEnv()
			Expect(len(testNodes)).To(BeNumerically(">=", 2), "节点标签亲和性测试至少需要 2 个节点")

			// Node 0 is the master and gets two labels.
			master := testNodes[0]
			master.Labels = []config.NodeLabel{
				{Key: "test-label", Value: "e2e-test"},
				{Key: "environment", Value: "testing"},
			}
			master.Role = []string{"master/node", "etcd"}

			// Node 1 is the worker; addon-node=redis is the label the redis addon selector targets.
			worker := testNodes[1]
			worker.Labels = []config.NodeLabel{{Key: "addon-node", Value: "redis"}}
			worker.Role = []string{"node"}

			clusterConfig = config.NewDefaultBKEClusterConfig(clusterName, []config.NodeInfo{master, worker})
		})

		// AfterEach deletes the test cluster when it still exists, waits until it is gone, and then
		// removes the generated config files. Cleanup errors are logged, not asserted.
		AfterEach(func() {
			By("清理测试集群")
			clusterGone := func() bool {
				return !clusterManager.ClusterExistsWithKubeconfig(clusterName, "")
			}
			if !clusterGone() {
				if delErr := clusterManager.DeleteClusterWithKubeconfig(clusterName, ""); delErr != nil {
					GinkgoWriter.Printf("Failed to delete cluster: %v\n", delErr)
				}
				Eventually(clusterGone, uninstallTimeout, 10*time.Second).Should(BeTrue())
			}

			// Remove the uploaded files in the same order they were generated.
			leftovers := []struct{ path, failFormat string }{
				{configPath, "Failed to cleanup config file: %v\n"},
				{nodeConfigPath, "Failed to cleanup node config file: %v\n"},
			}
			for _, file := range leftovers {
				if file.path == "" {
					continue
				}
				if rmErr := clusterManager.CleanupConfig(file.path); rmErr != nil {
					GinkgoWriter.Printf(file.failFormat, rmErr)
				}
			}
		})

		// Case: the cluster is created and the configured node labels are present.
		// Steps: 1) generate and upload the config carrying master/worker labels; 2) run the create
		// command; 3) wait for state Healthy; 4) check the labels through the cluster checker.
		// Expected: test-label=e2e-test and environment=testing (configured on the master) and
		// addon-node=redis (configured on the worker) are all found.
		It("应该成功创建集群并验证节点标签", Label("node-labels-basic"), SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			By("生成集群配置文件")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("执行集群创建命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy")
			Eventually(func() string {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return ""
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state
			}, installTimeout, pollInterval).Should(Equal("Healthy"), "集群应该变为 Healthy 状态")

			By("验证节点标签")
			checker := utils.NewClusterCheckerWithParentKubeconfig(sshExecutor, clusterName, "")

			// expectLabel polls until the checker reports the key=value label, logging lookup
			// errors instead of dropping them.
			// NOTE(review): CheckNodeHasLabel takes no node name, so it presumably matches any node
			// of the cluster — confirm against its implementation.
			expectLabel := func(key, value, failMessage string) {
				Eventually(func() bool {
					hasLabel, err := checker.CheckNodeHasLabel(key, value)
					if err != nil {
						GinkgoWriter.Printf("检查节点标签 %s=%s 失败: %v\n", key, value, err)
						return false
					}
					return hasLabel
				}, 2*time.Minute, 10*time.Second).Should(BeTrue(), failMessage)
			}

			// Labels configured on the master node.
			expectLabel("test-label", "e2e-test", "Master 节点应该有 test-label 标签")
			expectLabel("environment", "testing", "Master 节点应该有 environment 标签")

			// Label configured on the worker node.
			expectLabel("addon-node", "redis", "Worker 节点应该有 addon-node 标签")
		})

		// Case: an addon with a node selector is deployed according to that selector.
		// Steps: 1) add a redis addon with NodeSelector addon-node=redis to the cluster config;
		// 2) create the cluster and wait for state Healthy; 3) look up the node running the redis
		// Pod and check the addon-node=redis label.
		// Expected: a redis Pod is scheduled and the addon-node=redis label check passes.
		It("应该支持多标签并验证 addon 亲和性部署", Label("node-labels-addon-affinity"), SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			// Extend the base addon list with redis pinned to nodes labelled addon-node=redis.
			redisAddon := config.AddonConfig{
				Name:         "redis",
				Version:      "6.2.12",
				Block:        false,
				NodeSelector: map[string]string{"addon-node": "redis"},
			}
			clusterConfig.ExtraAddons = append(clusterConfig.ExtraAddons, redisAddon)

			By("生成集群配置文件")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("执行集群创建命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy")
			pollState := func() string {
				phase, state, clusterStatus, statusErr := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if statusErr != nil {
					GinkgoWriter.Printf("获取集群状态失败: %v\n", statusErr)
					return ""
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state
			}
			Eventually(pollState, installTimeout, pollInterval).Should(Equal("Healthy"), "集群应该变为 Healthy 状态")

			By("验证 redis addon Pod 运行在带有正确标签的节点上")
			checker := utils.NewClusterCheckerWithParentKubeconfig(sshExecutor, clusterName, "")
			redisOnLabelledNode := func() bool {
				podNode, lookupErr := checker.GetPodNodeName("", "app.kubernetes.io/name=redis")
				if lookupErr != nil {
					return false
				}
				if podNode == "" {
					return false
				}
				GinkgoWriter.Printf("redis Pod 运行在节点: %s\n", podNode)

				// Only the worker node is configured with addon-node=redis.
				// NOTE(review): podNode is not passed to CheckNodeHasLabel, so this presumably checks
				// that some node has the label rather than the Pod's node — confirm.
				labelled, _ := checker.CheckNodeHasLabel("addon-node", "redis")
				return labelled
			}
			Eventually(redisOnLabelledNode, 5*time.Minute, 15*time.Second).Should(BeTrue(), "redis 应该部署在带有 addon-node=redis 标签的节点上")
		})

		// Case: an addon without any affinity setting is deployed with default scheduling.
		// Steps: 1) create the cluster from the guide node with a redis addon that has no NodeSelector;
		// 2) wait for the cluster to become Healthy;
		// 3) use the checker to locate the node running the redis Pod.
		// Expected: the redis Pod exists and reports a non-empty node name, i.e. default scheduling
		// is used when no affinity is configured.
		It("应该支持 addon 未设置亲和性时的默认部署", Label("node-labels-no-affinity"), SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			// Add redis without a NodeSelector.
			clusterConfig.ExtraAddons = append(clusterConfig.ExtraAddons, config.AddonConfig{
				Name:    "redis",
				Version: "6.2.12",
			})

			By("生成集群配置文件")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("执行集群创建命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy")
			// ctx is passed to Eventually so that polling is interrupted when the spec timeout fires.
			Eventually(ctx, func() string {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return ""
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state
			}, installTimeout, pollInterval).Should(Equal("Healthy"), "集群应该变为 Healthy 状态")

			By("验证 redis addon Pod 正常部署")
			checker := utils.NewClusterCheckerWithParentKubeconfig(sshExecutor, clusterName, "")
			Eventually(ctx, func() bool {
				nodeName, err := checker.GetPodNodeName("", "app.kubernetes.io/name=redis")
				if err != nil {
					// Log the lookup error so a persistent failure is visible in the spec output.
					GinkgoWriter.Printf("获取 redis Pod 所在节点失败: %v\n", err)
				}
				return nodeName != ""
			}, 5*time.Minute, 15*time.Second).Should(BeTrue(), "未设置亲和性时，redis 应该能正常部署在任意节点")
		})

		// Case: an addon whose affinity label matches no node is not deployed to any node.
		// Steps: 1) configure the redis addon NodeSelector with a label key/value that does not exist;
		// 2) create the cluster through the guide node and wait for Healthy;
		// 3) query the node of the redis Pod and assert that it is never scheduled (empty node name).
		// Expected: with a non-matching affinity label the redis Pod is not scheduled anywhere, while
		// the cluster still becomes Healthy.
		It("应该在 addon 标签不匹配时不部署至节点", Label("node-labels-mismatch"), SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			// Add redis with an affinity on a label that no node carries.
			clusterConfig.ExtraAddons = append(clusterConfig.ExtraAddons, config.AddonConfig{
				Name:    "redis",
				Version: "6.2.12",
				NodeSelector: map[string]string{
					"non-existent-key": "some-value",
				},
			})

			By("生成集群配置文件")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("执行集群创建命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy")
			// ctx is passed to Eventually so that polling is interrupted when the spec timeout fires.
			Eventually(ctx, func() string {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return ""
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state
			}, installTimeout, pollInterval).Should(Equal("Healthy"), "集群应该变为 Healthy 状态")

			By("验证 redis addon Pod 未被调度")
			checker := utils.NewClusterCheckerWithParentKubeconfig(sshExecutor, clusterName, "")
			// Consistently (not Eventually): the Pod must stay unscheduled for the whole window.
			Consistently(ctx, func() bool {
				nodeName, err := checker.GetPodNodeName("", "app.kubernetes.io/name=redis")
				if err != nil {
					// NOTE(review): a lookup error yields an empty node name and therefore counts as
					// "not scheduled"; it is logged so such vacuous passes can be spotted.
					GinkgoWriter.Printf("获取 redis Pod 所在节点失败: %v\n", err)
				}
				return nodeName == ""
			}, 1*time.Minute, 10*time.Second).Should(BeTrue(), "标签不匹配时，redis Pod 不应该被调度到任何节点")
		})

		// Case: node labels containing illegal characters are ignored.
		// Steps: 1) add a label with illegal characters (`invalid-label` / `!!!#@$`) to a node in the
		// cluster config; 2) create the cluster through the guide node; 3) wait for Healthy;
		// 4) check that the illegal label was not applied to the node.
		// Expected: the illegal label never shows up in the node labels and the cluster still
		// becomes ready.
		It("应该忽略包含非法字符的节点标签", Label("node-labels-invalid"), SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			// Attach an illegal label (special characters in the value) to the node at index 1.
			// NOTE(review): the original comment called this the Master node — confirm which role
			// Nodes[1] has in this suite's BeforeEach.
			clusterConfig.Nodes[1].Labels = append(clusterConfig.Nodes[1].Labels,
				config.NodeLabel{Key: "invalid-label", Value: "!!!#@$"},
			)

			By("生成集群配置文件")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			GinkgoWriter.Printf("生成的集群配置文件: %s, 节点配置文件: %s\n", configPath, nodeConfigPath)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("执行集群创建命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy")
			// ctx is passed to Eventually so that polling is interrupted when the spec timeout fires.
			Eventually(ctx, func() string {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return ""
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state
			}, installTimeout, pollInterval).Should(Equal("Healthy"), "集群应该变为 Healthy 状态")

			By("验证非法标签未被应用")
			checker := utils.NewClusterCheckerWithParentKubeconfig(sshExecutor, clusterName, "")
			// Consistently instead of Eventually: Eventually(...).Should(BeFalse()) succeeds on the
			// very first false poll and so proves nothing about the label staying absent. The label
			// must be missing for the whole observation window.
			Consistently(ctx, func() bool {
				// The second return value is discarded on purpose.
				// NOTE(review): a failed check presumably reports false and therefore counts as
				// "label absent" — confirm against CheckNodeHasLabel.
				hasLabel, _ := checker.CheckNodeHasLabel("invalid-label", "!!!#@$")
				return hasLabel
			}, 1*time.Minute, 10*time.Second).Should(BeFalse(), "非法字符的标签不应该存在于节点中")
		})
	})

	Describe("Addon 扩展组件安装", Label("addon-installation", "post-init"), func() {
		// State shared by the specs of this container.
		// clusterName and clusterConfig are reassigned in BeforeEach for every spec.
		// configPath and nodeConfigPath are set by each spec from GenerateAndUpload and are
		// passed to CleanupConfig in AfterEach when non-empty.
		var (
			clusterName    string
			clusterConfig  *config.BKEClusterConfig
			configPath     string
			nodeConfigPath string
		)

		// Prepare a uniquely named single-node cluster configuration for each spec.
		BeforeEach(func() {
			// The Unix timestamp keeps cluster names distinct between specs.
			clusterName = fmt.Sprintf("test-addon-%d", time.Now().Unix())

			testNodes := config.LoadTestNodesFromEnv()
			Expect(len(testNodes)).To(BeNumerically(">=", 1), "至少需要一个节点")

			// Only the first node is used; it takes both the master/node and etcd roles.
			firstNode := testNodes[0]
			firstNode.Role = []string{"master/node", "etcd"}
			clusterNodes := []config.NodeInfo{firstNode}
			clusterConfig = config.NewDefaultBKEClusterConfig(clusterName, clusterNodes)
		})

		// Tear down the cluster created by the spec and remove the uploaded config files.
		AfterEach(func() {
			// Config cleanup is deferred so it still runs when the deletion wait below fails and
			// aborts this node. The paths are reset afterwards so that a later spec which fails
			// before generating its own config does not retry paths left over from this one.
			defer func() {
				if configPath != "" {
					if err := clusterManager.CleanupConfig(configPath); err != nil {
						GinkgoWriter.Printf("Failed to cleanup config file: %v\n", err)
					}
				}
				if nodeConfigPath != "" {
					if err := clusterManager.CleanupConfig(nodeConfigPath); err != nil {
						GinkgoWriter.Printf("Failed to cleanup node config file: %v\n", err)
					}
				}
				configPath, nodeConfigPath = "", ""
			}()

			By("清理测试集群")
			if !clusterManager.ClusterExistsWithKubeconfig(clusterName, "") {
				return
			}
			// A delete error is only logged: the wait below still decides whether the cluster is gone.
			if err := clusterManager.DeleteClusterWithKubeconfig(clusterName, ""); err != nil {
				GinkgoWriter.Printf("Failed to delete cluster: %v\n", err)
			}
			Eventually(func() bool {
				return !clusterManager.ClusterExistsWithKubeconfig(clusterName, "")
			}, uninstallTimeout, 10*time.Second).Should(BeTrue())
		})

		// Case: a chart can be installed from an https chart repository.
		// Steps: 1) add nginx as a chart addon and configure an https ChartRepo;
		// 2) generate and upload the config through the guide node;
		// 3) run `bke cluster create` and wait for Healthy;
		// 4) verify that the Pods in the nginx namespace are ready.
		// Expected: the cluster installs successfully and every Pod in the nginx namespace is ready.
		It("应该支持从https格式仓库安装chart", Label("install-chart-from-https-repo", "skip-temporarily"), SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			clusterConfig.ExtraAddons = append(clusterConfig.ExtraAddons, config.AddonConfig{
				Name:        "nginx",
				Version:     "22.6.1",
				Type:        "chart",
				ReleaseName: "nginx",
				Namespace:   "nginx",
			})

			clusterConfig.ChartRepo = config.BitnamiChartRepoConfig()

			By("生成集群配置文件")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("执行集群创建命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy")
			// ctx is passed to Eventually so that polling is interrupted when the spec timeout fires.
			Eventually(ctx, func() string {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return ""
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state
			}, installTimeout, pollInterval).Should(Equal("Healthy"), "集群应该变为 Healthy 状态")

			By("验证nginx组件")
			checker := utils.NewClusterCheckerWithParentKubeconfig(sshExecutor, clusterName, "")

			// Wait until every Pod in the nginx namespace is ready.
			Eventually(ctx, func() bool {
				allReady, notReady, err := checker.CheckNamespacePodsReadyWithKubeconfig("nginx", "")
				if err != nil {
					// Log the error so a persistent check failure is visible in the spec output.
					GinkgoWriter.Printf("检查 nginx Pod 状态失败: %v\n", err)
					return false
				}
				if !allReady {
					GinkgoWriter.Printf("等待 nginx Pod 就绪: %s\n", notReady)
					return false
				}
				return true
			}, 5*time.Minute, 10*time.Second).Should(BeTrue(), "nginx 命名空间下的 Pod 应该全部就绪")
		})

		// Case: a chart can be installed from an http chart repository.
		// Steps: 1) configure ChartRepo as an http source (installing plugin-management-service first
		// when needed); 2) generate and upload the cluster config through the guide node;
		// 3) run `bke cluster create` and wait for Healthy;
		// 4) verify that the Pods of the logging chart components can be found.
		// Expected: installation from the http repository succeeds and the key components
		// (logging / loki / logging-operator) are deployed.
		It("应该支持从http格式仓库安装chart", Label("install-chart-from-http-repo"), SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			hasPluginManagementService := false
			for _, addon := range clusterConfig.Addons {
				if addon.Name == "plugin-management-service" || addon.Name == "openfuyao-system-controller" {
					hasPluginManagementService = true
					break
				}
			}

			// Installing logging-package without plugin-management-service fails, so add it when the
			// base addon list does not already provide it.
			if !hasPluginManagementService {
				clusterConfig.ExtraAddons = append(clusterConfig.ExtraAddons, config.AddonConfig{
					Name:        "plugin-management-service",
					Version:     "0.0.0-latest",
					ReleaseName: "plugin-management",
					Namespace:   "openfuyao-system",
					Type:        "chart",
				})
			}

			clusterConfig.ExtraAddons = append(clusterConfig.ExtraAddons, config.AddonConfig{
				Name:        "logging-package",
				Version:     "0.0.0-latest",
				ReleaseName: "logging",
				Type:        "chart",
				Namespace:   "logging",
			})

			clusterConfig.ChartRepo = config.GuideNodeChartRepoConfig()

			By("生成集群配置文件")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("执行集群创建命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy")
			// ctx is passed to Eventually so that polling is interrupted when the spec timeout fires.
			Eventually(ctx, func() string {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return ""
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state
			}, installTimeout, pollInterval).Should(Equal("Healthy"), "集群应该变为 Healthy 状态")

			By("验证日志组件")
			checker := utils.NewClusterCheckerWithParentKubeconfig(sshExecutor, clusterName, "")

			// Verify the logging-website component (Pods labelled app=proxy-logging) is installed.
			for _, component := range []string{"proxy-logging"} {
				Eventually(ctx, func() bool {
					for _, labelKey := range []string{"app"} {
						selector := fmt.Sprintf("%s=%s", labelKey, component)
						nodeName, err := checker.GetPodNodeName("logging", selector)
						if err != nil {
							GinkgoWriter.Printf("查询 Pod 失败 (namespace=logging, selector=%s): %v\n", selector, err)
						}
						if nodeName != "" {
							return true
						}
					}
					return false
				}, 2*time.Minute, 10*time.Second).Should(BeTrue(), fmt.Sprintf("应该能找到 %s 组件", component))
			}

			// Verify the loki component is installed; either label key may identify its Pods.
			for _, component := range []string{"loki"} {
				Eventually(ctx, func() bool {
					for _, labelKey := range []string{"app", "name"} {
						selector := fmt.Sprintf("%s=%s", labelKey, component)
						nodeName, err := checker.GetPodNodeName("loki", selector)
						if err != nil {
							GinkgoWriter.Printf("查询 Pod 失败 (namespace=loki, selector=%s): %v\n", selector, err)
						}
						if nodeName != "" {
							return true
						}
					}
					return false
				}, 2*time.Minute, 10*time.Second).Should(BeTrue(), fmt.Sprintf("应该能找到 %s 组件", component))
			}

			// Verify the logging-operator component is installed.
			for _, component := range []string{"logging-operator"} {
				Eventually(ctx, func() bool {
					for _, labelKey := range []string{"app"} {
						selector := fmt.Sprintf("%s=%s", labelKey, component)
						nodeName, err := checker.GetPodNodeName("openfuyao-system", selector)
						if err != nil {
							GinkgoWriter.Printf("查询 Pod 失败 (namespace=openfuyao-system, selector=%s): %v\n", selector, err)
						}
						if nodeName != "" {
							return true
						}
					}
					return false
				}, 2*time.Minute, 10*time.Second).Should(BeTrue(), fmt.Sprintf("应该能找到 %s 组件", component))
			}
		})

		// Case: an addon references a chart that does not exist in the chart repository.
		// Steps: 1) add a non-existent chart (`elon-musk`) to the addon config;
		// 2) configure ChartRepo and generate/upload the cluster config through the guide node;
		// 3) run `bke cluster create`; 4) verify that the chart's namespace contains no Pods.
		// Expected: the cluster still reaches Healthy, but the non-existent chart is not deployed and
		// its namespace has no Pods.
		It("配置chart仓库不存在的chart", Label("set-chart-repo-not-exist-chart"), SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			clusterConfig.ExtraAddons = append(clusterConfig.ExtraAddons, config.AddonConfig{
				Name:        "elon-musk",
				Version:     "0.0.0-elon-musk",
				ReleaseName: "elon-musk",
				Namespace:   "elon-musk-1",
				Type:        "chart",
			})
			clusterConfig.ChartRepo = config.OpenFuyaoChartRepoConfig()

			By("生成集群配置文件")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("执行集群创建命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy")
			// ctx is passed to Eventually so that polling is interrupted when the spec timeout fires.
			Eventually(ctx, func() string {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return ""
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state
			}, installTimeout, pollInterval).Should(Equal("Healthy"), "集群应该变为 Healthy 状态")

			By("验证组件是否安装")
			checker := utils.NewClusterCheckerWithParentKubeconfig(sshExecutor, clusterName, "")
			// The namespace of the bogus chart must not contain any Pod. The check error is logged
			// rather than asserted.
			// NOTE(review): it is not visible here whether the "no pods found" case also returns a
			// non-nil error — confirm before asserting on it.
			_, notReady, checkErr := checker.CheckNamespacePodsReadyWithKubeconfig("elon-musk-1", "")
			if checkErr != nil {
				GinkgoWriter.Printf("检查 elon-musk-1 命名空间 Pod 状态返回错误: %v\n", checkErr)
			}
			Expect(notReady).Should(Equal("no pods found"), "elon-musk-1 命名空间下应该无Pod")
		})

		// Case: configuring a broken chart repository makes the cluster installation fail.
		// Steps: 1) configure an invalid ChartRepo (malformed or unreachable domain/IP/port);
		// 2) generate and upload the cluster config through the guide node;
		// 3) run `bke cluster create`; 4) verify that the BC (BKE cluster resource) was not created.
		// Expected: because the chart repository is broken, the BC does not exist or cannot be
		// fetched, and the creation flow cannot deploy the expected components.
		It("配置异常chart仓库应该安装集群失败", Label("set-err-chart-repo-install-cluster-failed"), SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			pluginAddon := config.AddonConfig{
				Name:        "plugin-management-service",
				Version:     "0.0.0-latest",
				ReleaseName: "plugin-management",
				Namespace:   "openfuyao-system",
				Type:        "chart",
			}
			clusterConfig.ExtraAddons = append(clusterConfig.ExtraAddons, pluginAddon)

			// Deliberately malformed repository settings: truncated IP and a non-numeric port.
			brokenRepo := &config.ChartRepoConfig{
				Domain: "my.chart.repo.elon.musk",
				IP:     "192.168.1",
				Port:   "443z",
				Prefix: "",
			}
			clusterConfig.ChartRepo = brokenRepo

			By("生成集群配置文件")
			var genErr error
			configPath, nodeConfigPath, genErr = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(genErr).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("执行集群创建命令")
			createErr := clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(createErr).NotTo(HaveOccurred(), "创建集群命令应该未报错，但是未创建集群")

			By("获取bc")
			// NOTE(review): creation runs in the background, so this immediate lookup can presumably
			// also fail merely because the resource has not been created yet — confirm that the
			// check really proves the BC is never created.
			_, lookupErr := clusterManager.GetBkeCluster(clusterConfig.ClusterName, clusterConfig.Namespace)
			Expect(lookupErr).To(HaveOccurred(), "应该不能获取到bc")
		})

		// Case: a chart addon can be installed with a custom values.yaml.
		// Steps: 1) provide a `values.yaml` for the chart addon (image repositories, volume mounts,
		// alerting rules, ... for logging-operator / loki-stack);
		// 2) generate and upload the cluster config through the guide node;
		// 3) run `bke cluster create` and wait for Healthy;
		// 4) verify that the components configured through values.yaml are deployed.
		// Expected: the chart addon using values.yaml installs successfully and its component Pods
		// can be found.
		It("应该支持安装chart包时配置values.yaml", Label("set-values-yaml-and-install-chart"), SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			valuesYaml := `
logging-operator:
  namespace:
    backend: logging
  images:
    core:
      repository: "cr.openfuyao.cn/openfuyao/logging-operator"
      pullPolicy: Always
loki-stack:
  namespace: loki
  promtail:
    defaultVolumes:
      - name: containers
        hostPath:
          path: /var/lib/docker/containers
      - name: pods
        hostPath:
          path: /var/log/pods
      - name: random
        hostPath:
          path: /var/log/random
    defaultVolumeMounts:
      - name: containers
        mountPath: /var/lib/docker/containers
        readOnly: true
      - name: pods
        mountPath: /var/log/pods
        readOnly: true
      - name: random
        mountPath: /var/log/random
        readOnly: true
  loki:
    config:
      limits_config:
        reject_old_samples_max_age: 168h
      table_manager:
        retention_deletes_enabled: true
        retention_period: 240h
    alerting_groups:
      - name: GenericHighErrorRate
        rules:
          - alert: GenericHighErrorRate
            expr: |
              sum by (job, instance) (rate({job=~".+"} |= "error" [5m])) > 0
            for: 5m
            labels:
              severity: critical
              loki: logging/k8s.io
            annotations:
              summary: "High error rate detected in {{ $labels.job }} instance {{ $labels.instance }}"
              description: "Job {{ $labels.job }} on instance {{ $labels.instance }} has a high rate of error logs."
      - name: GenericExceptionDetected
        rules:
          - alert: GenericExceptionDetected
            expr: |
              sum by (job, instance) (rate({job=~".+"} |= "Exception" [5m])) > 0
            for: 5m
            labels:
              severity: warning
              loki: logging/k8s.io
            annotations:
              summary: "Exception detected in {{ $labels.job }} instance {{ $labels.instance }}"
              description: "Job {{ $labels.job }} on instance {{ $labels.instance }} has logs containing 'Exception'."
      - name: GenericLogVolumeSpike
        rules:
          - alert: GenericLogVolumeSpike
            expr: |
              sum by (job, instance) (rate({job=~".+"}[5m])) > 1
            for: 5m
            labels:
              severity: warning
              loki: logging/k8s.io
            annotations:
              summary: "Log volume spike detected in {{ $labels.job }} instance {{ $labels.instance }}"
              description: "Job {{ $labels.job }} on instance {{ $labels.instance }} has a spike in log volume."
      - name: GenericErrorRateThresholdExceeded
        rules:
          - alert: GenericErrorRateThresholdExceeded
            expr: |
              (sum by (job, instance) (rate({job=~".+"} |= "error" [5m])) / sum by (job, instance) (rate({job=~".+"}[5m]))) > 0.05
            for: 5m
            labels:
              severity: critical
              loki: logging/k8s.io
            annotations:
              summary: "High error rate threshold exceeded in {{ $labels.job }} instance {{ $labels.instance }}"
              description: "Job {{ $labels.job }} on instance {{ $labels.instance }} has an error rate exceeding the threshold."
      - name: GenericServiceUnavailable
        rules:
          - alert: GenericServiceUnavailable
            expr: |
              sum by (job, instance) (rate({job=~".+"} |= "service unavailable" [5m])) > 0
            for: 5m
            labels:
              severity: critical
              loki: logging/k8s.io
            annotations:
              summary: "Service unavailable detected in {{ $labels.job }} instance {{ $labels.instance }}"
              description: "Job {{ $labels.job }} on instance {{ $labels.instance }} has logs indicating 'service unavailable'."
      - name: GenericApplicationStartupFailure
        rules:
          - alert: GenericApplicationStartupFailure
            expr: |
              sum by (job, instance) (rate({job=~".+"} |= "startup failure" [5m])) > 0
            for: 5m
            labels:
              severity: critical
              loki: logging/k8s.io
            annotations:
              summary: "Application startup failure detected in {{ $labels.job }} instance {{ $labels.instance }}"
              description: "Job {{ $labels.job }} on instance {{ $labels.instance }} has logs indicating 'startup failure'."
proxy-logging:
  namespace: logging
  enableOAuth: true
  images:
    core:
      repository: "cr.openfuyao.cn/openfuyao/logging-website"
      pullPolicy: Always
    oauth:
      repository: "cr.openfuyao.cn/openfuyao/oauth-proxy"
      pullPolicy: Always
`

			// Store the values in a ConfigMap that the addon references below.
			// NOTE(review): this ConfigMap is not removed anywhere in this spec or in the visible
			// AfterEach — confirm whether it is cleaned up elsewhere.
			cmNamespace := fmt.Sprintf("elon-%s", clusterConfig.Namespace)
			cmName := fmt.Sprintf("elon-%s", clusterName)
			err := resourceManager.CreateConfigMap(cmName, cmNamespace, "values.yaml", valuesYaml)
			Expect(err).NotTo(HaveOccurred(), "创建values.yaml configMap失败")

			hasPluginManagementService := false
			for _, addon := range clusterConfig.Addons {
				if addon.Name == "plugin-management-service" || addon.Name == "openfuyao-system-controller" {
					hasPluginManagementService = true
					break
				}
			}

			// Installing logging-package without plugin-management-service fails, so add it when the
			// base addon list does not already provide it.
			if !hasPluginManagementService {
				clusterConfig.ExtraAddons = append(clusterConfig.ExtraAddons, config.AddonConfig{
					Name:        "plugin-management-service",
					Version:     "0.0.0-latest",
					ReleaseName: "plugin-management",
					Namespace:   "openfuyao-system",
					Type:        "chart",
					Block:       true,
				})
			}

			clusterConfig.ExtraAddons = append(clusterConfig.ExtraAddons, config.AddonConfig{
				Name:        "logging-package",
				Version:     "1.0.3",
				Namespace:   "logging",
				ReleaseName: "logging",
				Type:        "chart",
				ValuesConfigMapRef: &config.ValuesConfigMapRef{
					Name:      cmName,
					Namespace: cmNamespace,
				},
			})

			clusterConfig.ChartRepo = config.OpenFuyaoChartRepoConfig()

			By("生成集群配置文件")
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("执行集群创建命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy")
			// ctx is passed to Eventually so that polling is interrupted when the spec timeout fires.
			Eventually(ctx, func() string {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return ""
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state
			}, installTimeout, pollInterval).Should(Equal("Healthy"), "集群应该变为 Healthy 状态")

			By("验证日志组件")
			checker := utils.NewClusterCheckerWithParentKubeconfig(sshExecutor, clusterName, "")

			// Wait until every Pod in the loki namespace is ready.
			Eventually(ctx, func() bool {
				allReady, notReady, err := checker.CheckNamespacePodsReadyWithKubeconfig("loki", "")
				if err != nil {
					GinkgoWriter.Printf("检查 loki Pod 状态失败: %v\n", err)
					return false
				}
				if !allReady {
					GinkgoWriter.Printf("等待 loki Pod 就绪: %s\n", notReady)
					return false
				}
				return true
			}, 5*time.Minute, 10*time.Second).Should(BeTrue(), "loki 命名空间下的 Pod 应该全部就绪")

			// Additionally verify the individual logging-package components by their app label.
			for _, component := range []string{"proxy-logging", "logging-operator"} {
				Eventually(ctx, func() bool {
					for _, labelKey := range []string{"app"} {
						selector := fmt.Sprintf("%s=%s", labelKey, component)
						nodeName, err := checker.GetPodNodeName("logging", selector)
						if err != nil {
							GinkgoWriter.Printf("查询 Pod 失败 (namespace=logging, selector=%s): %v\n", selector, err)
						}
						if nodeName != "" {
							return true
						}
					}
					return false
				}, 2*time.Minute, 10*time.Second).Should(BeTrue(), fmt.Sprintf("应该能找到 %s 组件", component))
			}
		})

		// Case: a chart can be installed, upgraded and uninstalled from an OCI chart repository.
		// Steps: 1) create the cluster from the guide node and make sure it is Healthy;
		// 2) install `logging-package` as an OCI chart addon (logging-operator and friends);
		// 3) verify the Pods in the related namespaces are ready;
		// 4) upgrade the chart version and verify the Deployment image tag;
		// 5) uninstall the chart addon and verify its Pods are deleted.
		// Expected: install, upgrade and uninstall all take effect; after uninstalling, the related
		// namespaces no longer contain the component Pods.
		It("应该支持从oci格式chart仓库安装、升级、卸载 chart", Label("install-upgrade-delete-chart-from-oci-repo"), SpecTimeout(InstallationItTimeout), func(ctx SpecContext) {
			// 1. Create the cluster.

			hasPluginManagementService := false
			for _, addon := range clusterConfig.Addons {
				if addon.Name == "plugin-management-service" || addon.Name == "openfuyao-system-controller" {
					hasPluginManagementService = true
					break
				}
			}

			// Installing logging-package without plugin-management-service fails, so add it when the
			// base addon list does not already provide it.
			if !hasPluginManagementService {
				clusterConfig.ExtraAddons = append(clusterConfig.ExtraAddons, config.AddonConfig{
					Name:        "plugin-management-service",
					Version:     "0.0.0-latest",
					ReleaseName: "plugin-management",
					Namespace:   "openfuyao-system",
					Type:        "chart",
					Block:       true,
				})
			}
			clusterConfig.ChartRepo = config.OpenFuyaoChartRepoConfig()

			By("生成集群配置文件")
			var err error
			configPath, nodeConfigPath, err = clusterManager.GetConfigGenerator().GenerateAndUpload(clusterConfig)
			Expect(err).NotTo(HaveOccurred(), "应该成功生成配置文件")

			By("执行集群创建命令")
			err = clusterManager.CreateClusterInBackgroundWithKubeconfig(configPath, nodeConfigPath, "")
			Expect(err).NotTo(HaveOccurred(), "创建集群命令应该成功执行")

			By("等待集群状态变为 Healthy")
			// ctx is passed to every Eventually in this spec so that polling is interrupted when the
			// spec timeout fires.
			Eventually(ctx, func() string {
				phase, state, clusterStatus, err := clusterManager.GetClusterFullStatusWithKubeconfig(clusterName, "")
				if err != nil {
					GinkgoWriter.Printf("获取集群状态失败: %v\n", err)
					return ""
				}
				GinkgoWriter.Printf("当前集群状态: phase=%s, state=%s, clusterStatus=%s\n", phase, state, clusterStatus)
				failOnClusterFailure(state, clusterStatus)
				return state
			}, installTimeout, pollInterval).Should(Equal("Healthy"), "集群应该变为 Healthy 状态")

			// 2. Modify the BC custom resource to install the logging component.
			By("安装日志组件")
			logAddon := config.AddonConfig{
				Name:        "logging-package",
				Version:     "1.0.2",
				ReleaseName: "logging",
				Type:        "chart",
				Namespace:   "logging",
			}
			err = clusterManager.AddNewAddons(clusterName, clusterConfig.Namespace, logAddon)
			Expect(err).NotTo(HaveOccurred(), "已经添加日志组件")

			By("验证日志组件")
			checker := utils.NewClusterCheckerWithParentKubeconfig(sshExecutor, clusterName, "")

			// Check the logging-website Pods (logging namespace).
			By("验证logging-package组件")
			Eventually(ctx, func() bool {
				allReady, notReady, err := checker.CheckNamespacePodsReadyWithKubeconfig("logging", "")
				if err != nil {
					GinkgoWriter.Printf("检查 logging Pod 状态失败: %v\n", err)
					return false
				}
				if !allReady {
					GinkgoWriter.Printf("等待 logging-package Pod 就绪: %s\n", notReady)
					return false
				}
				return true
			}, 10*time.Minute, 10*time.Second).Should(BeTrue(), "logging 命名空间下的 Pod 应该全部就绪")

			// Check the Pods in the openfuyao-system namespace.
			By("验证openfuyao-system命名空间下的组件")
			Eventually(ctx, func() bool {
				allReady, notReady, err := checker.CheckNamespacePodsReadyWithKubeconfig("openfuyao-system", "")
				if err != nil {
					GinkgoWriter.Printf("检查 openfuyao-system Pod 状态失败: %v\n", err)
					return false
				}
				if !allReady {
					GinkgoWriter.Printf("等待 openfuyao-system Pod 就绪: %s\n", notReady)
					return false
				}
				return true
			}, 5*time.Minute, 10*time.Second).Should(BeTrue(), "openfuyao-system 命名空间下的 Pod 应该全部就绪")

			// Check the Pods in the loki namespace.
			By("验证loki组件")
			Eventually(ctx, func() bool {
				allReady, notReady, err := checker.CheckNamespacePodsReadyWithKubeconfig("loki", "")
				if err != nil {
					GinkgoWriter.Printf("检查 loki Pod 状态失败: %v\n", err)
					return false
				}
				if !allReady {
					GinkgoWriter.Printf("等待 loki Pod 就绪: %s\n", notReady)
					return false
				}
				return true
			}, 5*time.Minute, 10*time.Second).Should(BeTrue(), "loki 命名空间下的 Pod 应该全部就绪")

			// Additionally verify the individual logging-package components by their app label.
			for _, component := range []string{"proxy-logging"} {
				Eventually(ctx, func() bool {
					for _, labelKey := range []string{"app"} {
						selector := fmt.Sprintf("%s=%s", labelKey, component)
						nodeName, err := checker.GetPodNodeName("logging", selector)
						if err != nil {
							GinkgoWriter.Printf("查询 Pod 失败 (namespace=logging, selector=%s): %v\n", selector, err)
						}
						if nodeName != "" {
							return true
						}
					}
					return false
				}, 2*time.Minute, 10*time.Second).Should(BeTrue(), fmt.Sprintf("应该能找到 %s 组件", component))
			}

			// 3. Upgrade the chart.

			By("修改日志组件版本号")
			err = clusterManager.PatchBkeClusterAddonVersion(clusterName, clusterConfig.Namespace,
				"logging-package", "1.0.3")
			Expect(err).NotTo(HaveOccurred(), "日志组件版本号修改成功")

			// Check the Pods in the openfuyao-system namespace after the version change.
			Eventually(ctx, func() bool {
				allReady, notReady, err := checker.CheckNamespacePodsReadyWithKubeconfig("openfuyao-system", "")
				if err != nil {
					GinkgoWriter.Printf("检查 openfuyao-system Pod 状态失败: %v\n", err)
					return false
				}
				if !allReady {
					GinkgoWriter.Printf("等待 openfuyao-system Pod 就绪: %s\n", notReady)
					return false
				}
				return true
			}, 5*time.Minute, 10*time.Second).Should(BeTrue(), "openfuyao-system 命名空间下的 pod 应该全部就绪")

			By("获取日志组件版本号")
			Eventually(ctx, func() bool {
				// The second return value is discarded on purpose: a failed lookup is retried on the
				// next poll.
				images, _ := checker.GetDeploymentImages("openfuyao-system", "logging-operator")
				// Inspect every image of the Deployment. The previous loop returned on its first
				// iteration, so the result depended on which image happened to be listed first.
				for _, image := range images {
					if utils.GetImageTag(image) == "1.0.3" {
						return true
					}
				}
				return false
			}, 3*time.Minute, 10*time.Second).Should(BeTrue(), "日志组件版本号应该为1.0.3")

			// 4. Uninstall the chart.

			By("卸载日志组件")
			err = clusterManager.RemoveBkeClusterAddonByName(clusterName, clusterConfig.Namespace, "logging-package")
			Expect(err).NotTo(HaveOccurred(), "从bc中删除日志组成功")

			// Wait until the logging namespace no longer contains any Pod.
			By("验证logging命名空间下的Pod被删除")
			Eventually(ctx, func() bool {
				// NOTE(review): the discarded second return value is presumably an error; a failed
				// check would then report false and end the wait early — confirm.
				hasPod, _ := checker.IsNamespaceHasPodsWithKubeconfig("logging", "")
				return hasPod
			}, 3*time.Minute, 10*time.Second).Should(BeFalse(), "logging命名空间下应该没有pod")

			// Wait until openfuyao-system no longer contains a logging-operator Pod.
			Eventually(ctx, func() bool {
				// Second return value discarded on purpose; same caveat as the check above.
				hasPod, _ := checker.IsNamespaceHasPodByNameWithKubeconfig("openfuyao-system", "logging-operator", "")
				return hasPod
			}, 3*time.Minute, 10*time.Second).Should(BeFalse(), "openfuyao-system命名空间下应该没有pod")
		})
	})

})