// NOTE(review): the lines "670 lines", "21 KiB", "Rust" that preceded this file
// were a paste/scrape artifact (viewer metadata), not Rust source; left here as
// a comment so the file compiles.

use std::{env, sync::Arc, time::Duration};
use reqwest::Url;
use testing_framework_core::{
scenario::{
Application, ApplicationExternalProvider, CleanupGuard, ClusterControlProfile, ClusterMode,
ClusterWaitHandle, DeploymentPolicy, DynError, ExistingCluster, FeedHandle, FeedRuntime,
HttpReadinessRequirement, Metrics, NodeClients, NodeControlHandle,
ObservabilityCapabilityProvider, ObservabilityInputs, RequiresNodeControl, Runner,
RuntimeAssembly, Scenario, SourceOrchestrationPlan, SourceProviders, StaticManagedProvider,
build_source_orchestration_plan, orchestrate_sources_with_providers,
},
topology::DeploymentDescriptor,
};
use tracing::info;
use super::{
ComposeDeployer, ComposeDeploymentMetadata,
attach_provider::{ComposeAttachProvider, ComposeAttachedClusterWait},
clients::ClientBuilder,
make_cleanup_guard,
ports::PortManager,
readiness::ReadinessChecker,
setup::{DeploymentContext, DeploymentSetup},
};
use crate::{
docker::control::{ComposeAttachedNodeControl, ComposeNodeControl},
env::ComposeDeployEnv,
errors::ComposeRunnerError,
infrastructure::{
environment::StackEnvironment,
ports::{HostPortMapping, compose_runner_host},
},
lifecycle::block_feed::spawn_block_feed_with_retry,
};
/// Env var toggling endpoint printing: when set (to any value), deployment
/// endpoint and profiling URLs are printed to stdout in `TESTNET_*` lines.
const PRINT_ENDPOINTS_ENV: &str = "TESTNET_PRINT_ENDPOINTS";
/// Drives compose-based deployments for a [`Scenario`], producing a configured
/// [`Runner`] plus optional deployment metadata (the compose project name).
pub struct DeploymentOrchestrator<E: ComposeDeployEnv> {
    // Deployer settings (e.g. whether readiness checks are enabled).
    deployer: ComposeDeployer<E>,
}
impl<E: ComposeDeployEnv> DeploymentOrchestrator<E> {
    /// Wraps the given deployer; no work happens until a `deploy*` call.
    pub const fn new(deployer: ComposeDeployer<E>) -> Self {
        Self { deployer }
    }

    /// Deploys the scenario and returns the runner, discarding the deployment
    /// metadata (see [`Self::deploy_with_metadata`]).
    pub async fn deploy<Caps>(
        &self,
        scenario: &Scenario<E, Caps>,
    ) -> Result<Runner<E>, ComposeRunnerError>
    where
        Caps: RequiresNodeControl + ObservabilityCapabilityProvider + Send + Sync,
    {
        self.deploy_with_metadata(scenario)
            .await
            .map(|(runner, _)| runner)
    }

    /// Deploys the scenario and returns both the runner and compose metadata.
    ///
    /// Flow: validate cluster mode → plan source orchestration →
    /// (existing-cluster short circuit) → prepare workspace → deploy nodes →
    /// re-resolve node clients via providers → assemble the runner.
    pub async fn deploy_with_metadata<Caps>(
        &self,
        scenario: &Scenario<E, Caps>,
    ) -> Result<(Runner<E>, ComposeDeploymentMetadata), ComposeRunnerError>
    where
        Caps: RequiresNodeControl + ObservabilityCapabilityProvider + Send + Sync,
    {
        validate_supported_cluster_mode(scenario).map_err(|source| {
            ComposeRunnerError::SourceOrchestration {
                source: source.into(),
            }
        })?;
        // Source planning is currently resolved here before deployer-specific setup.
        let source_plan = build_source_orchestration_plan(scenario).map_err(|source| {
            ComposeRunnerError::SourceOrchestration {
                source: source.into(),
            }
        })?;
        // Existing-cluster mode attaches to an already running stack instead of
        // deploying one; no compose project is created on that path.
        if matches!(scenario.cluster_mode(), ClusterMode::ExistingCluster) {
            return self
                .deploy_existing_cluster::<Caps>(scenario, source_plan)
                .await
                .map(|runner| (runner, existing_cluster_metadata(scenario)));
        }
        let deployment = scenario.deployment();
        let setup = DeploymentSetup::<E>::new(deployment);
        setup.validate_environment().await?;
        let observability = resolve_observability_inputs(scenario)?;
        let mut prepared = prepare_deployment::<E>(setup, &observability).await?;
        let deployment_policy = scenario.deployment_policy();
        // Readiness runs only when both the deployer and the policy allow it.
        let readiness_enabled =
            self.deployer.readiness_checks && deployment_policy.readiness_enabled;
        self.log_deploy_start(
            scenario,
            &prepared.descriptors,
            deployment_policy,
            &observability,
        );
        let mut deployed = deploy_nodes::<E>(
            &mut prepared.environment,
            &prepared.descriptors,
            readiness_enabled,
            deployment_policy.readiness_requirement,
        )
        .await?;
        // Re-resolve clients through the source-orchestration providers so that
        // attach/external sources can replace or extend the managed client set.
        let source_providers = self.source_providers(deployed.node_clients.snapshot());
        deployed.node_clients = self
            .resolve_node_clients(&source_plan, source_providers)
            .await?;
        let project_name = prepared.environment.project_name().to_owned();
        let runner = self
            .build_runner::<Caps>(
                scenario,
                prepared,
                deployed,
                observability,
                readiness_enabled,
                project_name.clone(),
            )
            .await?;
        self.log_deploy_ready(
            scenario,
            deployment_policy,
            deployment.node_count(),
            &compose_runner_host(),
            readiness_enabled,
        );
        Ok((
            runner,
            ComposeDeploymentMetadata {
                project_name: Some(project_name),
            },
        ))
    }

    /// Attaches to an externally managed cluster: no workspace or compose
    /// project is created; node control and cluster-wait handles are derived
    /// from the scenario's existing-cluster descriptor instead.
    async fn deploy_existing_cluster<Caps>(
        &self,
        scenario: &Scenario<E, Caps>,
        source_plan: SourceOrchestrationPlan,
    ) -> Result<Runner<E>, ComposeRunnerError>
    where
        Caps: RequiresNodeControl + ObservabilityCapabilityProvider + Send + Sync,
    {
        let observability = resolve_observability_inputs(scenario)?;
        // No managed clients exist in this mode; providers must supply them all.
        let source_providers = self.source_providers(Vec::new());
        let node_clients = self
            .resolve_node_clients(&source_plan, source_providers)
            .await?;
        self.ensure_non_empty_node_clients(&node_clients)?;
        let node_control = self.attached_node_control::<Caps>(scenario)?;
        let cluster_wait = self.attached_cluster_wait(scenario)?;
        let (feed, feed_task) = spawn_block_feed_with_retry::<E>(&node_clients).await?;
        let assembly = build_runtime_assembly(
            scenario.deployment().clone(),
            node_clients,
            scenario.duration(),
            scenario.expectation_cooldown(),
            scenario.cluster_control_profile(),
            observability.telemetry_handle()?,
            feed,
            node_control,
            cluster_wait,
        );
        // The feed task is the only resource to tear down in attach mode.
        let cleanup_guard: Box<dyn CleanupGuard> = Box::new(feed_task);
        Ok(assembly.build_runner(Some(cleanup_guard)))
    }

    /// Builds the provider set used by source orchestration: managed (the
    /// clients given here), attach (compose host) and external application.
    fn source_providers(&self, managed_clients: Vec<E::NodeClient>) -> SourceProviders<E> {
        SourceProviders::default()
            .with_managed(Arc::new(StaticManagedProvider::new(managed_clients)))
            .with_attach(Arc::new(ComposeAttachProvider::<E>::new(
                compose_runner_host(),
            )))
            .with_external(Arc::new(ApplicationExternalProvider))
    }

    /// Runs the source-orchestration plan against the providers, producing the
    /// final set of node clients. Errors are wrapped as `SourceOrchestration`.
    async fn resolve_node_clients(
        &self,
        source_plan: &SourceOrchestrationPlan,
        source_providers: SourceProviders<E>,
    ) -> Result<NodeClients<E>, ComposeRunnerError> {
        orchestrate_sources_with_providers(source_plan, source_providers)
            .await
            .map_err(|source| ComposeRunnerError::SourceOrchestration { source })
    }

    /// Preflight: a runtime without any node clients is unusable.
    fn ensure_non_empty_node_clients(
        &self,
        node_clients: &NodeClients<E>,
    ) -> Result<(), ComposeRunnerError> {
        if node_clients.is_empty() {
            return Err(ComposeRunnerError::RuntimePreflight);
        }
        Ok(())
    }

    /// Builds a node-control handle for existing-cluster mode, but only when
    /// the capability set requires one (`Caps::REQUIRED`); returns `Ok(None)`
    /// otherwise. Requires the scenario to carry an existing-cluster
    /// descriptor — its absence here is an internal invariant violation.
    fn attached_node_control<Caps>(
        &self,
        scenario: &Scenario<E, Caps>,
    ) -> Result<Option<Arc<dyn NodeControlHandle<E>>>, ComposeRunnerError>
    where
        Caps: RequiresNodeControl + Send + Sync,
    {
        if !Caps::REQUIRED {
            return Ok(None);
        }
        let attach = scenario
            .existing_cluster()
            .ok_or(ComposeRunnerError::InternalInvariant {
                message: "existing-cluster node control requested outside existing-cluster mode",
            })?;
        let node_control = ComposeAttachedNodeControl::try_from_existing_cluster(attach)
            .map_err(|source| ComposeRunnerError::SourceOrchestration { source })?;
        Ok(Some(Arc::new(node_control) as Arc<dyn NodeControlHandle<E>>))
    }

    /// Builds the cluster-wait handle for existing-cluster mode from the
    /// scenario's descriptor and the compose runner host.
    fn attached_cluster_wait<Caps>(
        &self,
        scenario: &Scenario<E, Caps>,
    ) -> Result<Arc<dyn ClusterWaitHandle<E>>, ComposeRunnerError>
    where
        Caps: Send + Sync,
    {
        let attach = scenario
            .existing_cluster()
            .ok_or(ComposeRunnerError::InternalInvariant {
                message: "compose cluster wait requested outside existing-cluster mode",
            })?;
        let cluster_wait = ComposeAttachedClusterWait::<E>::try_new(compose_runner_host(), attach)
            .map_err(|source| ComposeRunnerError::SourceOrchestration { source })?;
        Ok(Arc::new(cluster_wait))
    }

    /// Builds the final runner for a managed deployment: wires node control,
    /// cluster wait, endpoint logging/printing, the block feed and the
    /// cleanup guard that tears the stack down.
    async fn build_runner<Caps>(
        &self,
        scenario: &Scenario<E, Caps>,
        mut prepared: PreparedDeployment<E>,
        deployed: DeployedNodes<E>,
        observability: ObservabilityInputs,
        readiness_enabled: bool,
        project_name: String,
    ) -> Result<Runner<E>, ComposeRunnerError>
    where
        Caps: RequiresNodeControl + ObservabilityCapabilityProvider + Send + Sync,
    {
        let telemetry = observability.telemetry_handle()?;
        let node_control = self.maybe_node_control::<Caps>(&prepared.environment);
        // Managed deployments reuse the attach-style wait by describing the
        // freshly created project as an "existing" cluster.
        let cluster_wait =
            self.managed_cluster_wait(ComposeDeploymentMetadata::for_project(project_name))?;
        log_observability_endpoints(&observability);
        log_profiling_urls(&deployed.host, &deployed.host_ports);
        maybe_print_endpoints(&observability, &deployed.host, &deployed.host_ports);
        let input = RuntimeBuildInput {
            deployed: &deployed,
            descriptors: prepared.descriptors.clone(),
            duration: scenario.duration(),
            expectation_cooldown: scenario.expectation_cooldown(),
            cluster_control_profile: scenario.cluster_control_profile(),
            telemetry,
            environment: &mut prepared.environment,
            node_control,
            cluster_wait,
        };
        let runtime = build_compose_runtime::<E>(input).await?;
        // The guard owns both environment cleanup and the feed-task handle.
        let cleanup_guard =
            make_cleanup_guard(prepared.environment.into_cleanup()?, runtime.feed_task);
        info!(
            effective_readiness = readiness_enabled,
            host = deployed.host,
            "compose runtime prepared"
        );
        Ok(runtime.assembly.build_runner(Some(cleanup_guard)))
    }

    /// Returns a compose node-control handle only when the capability set
    /// requires one; the handle shells out against the compose file/project.
    fn maybe_node_control<Caps>(
        &self,
        environment: &StackEnvironment,
    ) -> Option<Arc<dyn NodeControlHandle<E>>>
    where
        Caps: RequiresNodeControl + Send + Sync,
    {
        Caps::REQUIRED.then(|| {
            Arc::new(ComposeNodeControl {
                compose_file: environment.compose_path().to_path_buf(),
                project_name: environment.project_name().to_owned(),
            }) as Arc<dyn NodeControlHandle<E>>
        })
    }

    /// Builds the cluster-wait handle for a managed deployment by converting
    /// its metadata into an existing-cluster descriptor.
    fn managed_cluster_wait(
        &self,
        metadata: ComposeDeploymentMetadata,
    ) -> Result<Arc<dyn ClusterWaitHandle<E>>, ComposeRunnerError> {
        let existing_cluster = metadata
            .existing_cluster()
            .map_err(|source| ComposeRunnerError::SourceOrchestration { source })?;
        let cluster_wait =
            ComposeAttachedClusterWait::<E>::try_new(compose_runner_host(), &existing_cluster)
                .map_err(|source| ComposeRunnerError::SourceOrchestration { source })?;
        Ok(Arc::new(cluster_wait))
    }

    /// Structured log line emitted just before node deployment begins.
    fn log_deploy_start<Caps>(
        &self,
        scenario: &Scenario<E, Caps>,
        descriptors: &E::Deployment,
        deployment_policy: DeploymentPolicy,
        observability: &ObservabilityInputs,
    ) {
        // Same conjunction as in `deploy_with_metadata`; recomputed for logging.
        let effective_readiness =
            self.deployer.readiness_checks && deployment_policy.readiness_enabled;
        info!(
            nodes = descriptors.node_count(),
            duration_secs = scenario.duration().as_secs(),
            readiness_checks = self.deployer.readiness_checks,
            readiness_enabled = deployment_policy.readiness_enabled,
            readiness_requirement = ?deployment_policy.readiness_requirement,
            effective_readiness,
            metrics_query_url = observability.metrics_query_url.as_ref().map(|u| u.as_str()),
            metrics_otlp_ingest_url = observability
                .metrics_otlp_ingest_url
                .as_ref()
                .map(|u| u.as_str()),
            grafana_url = observability.grafana_url.as_ref().map(|u| u.as_str()),
            "compose deployment starting"
        );
    }

    /// Structured log line emitted once the deployment is ready and control is
    /// handed to the scenario runner.
    fn log_deploy_ready<Caps>(
        &self,
        scenario: &Scenario<E, Caps>,
        deployment_policy: DeploymentPolicy,
        node_count: usize,
        host: &str,
        readiness_enabled: bool,
    ) {
        info!(
            nodes = node_count,
            duration_secs = scenario.duration().as_secs(),
            readiness_checks = self.deployer.readiness_checks,
            readiness_enabled = deployment_policy.readiness_enabled,
            readiness_requirement = ?deployment_policy.readiness_requirement,
            effective_readiness = readiness_enabled,
            host,
            "compose deployment ready; handing control to scenario runner"
        );
    }
}
<<<<<<< HEAD
fn validate_supported_cluster_mode<E: Application, Caps>(
scenario: &Scenario<E, Caps>,
) -> Result<(), DynError> {
if !matches!(scenario.cluster_mode(), ClusterMode::ExistingCluster) {
return Ok(());
}
let cluster = scenario
.existing_cluster()
.ok_or_else(|| DynError::from("existing-cluster mode requires an existing cluster"))?;
ensure_compose_existing_cluster(cluster)
}
fn ensure_compose_existing_cluster(cluster: &ExistingCluster) -> Result<(), DynError> {
if cluster.compose_project().is_some() && cluster.compose_services().is_some() {
return Ok(());
}
Err("compose deployer requires a compose existing-cluster descriptor".into())
}
#[cfg(test)]
mod tests {
use testing_framework_core::scenario::ExistingCluster;
use super::ensure_compose_existing_cluster;
#[test]
fn compose_cluster_validator_accepts_compose_descriptor() {
ensure_compose_existing_cluster(&ExistingCluster::for_compose_project(
"project".to_owned(),
))
.expect("compose descriptor should be accepted");
}
#[test]
fn compose_cluster_validator_rejects_k8s_descriptor() {
let error = ensure_compose_existing_cluster(&ExistingCluster::for_k8s_selector(
"app=node".to_owned(),
))
.expect_err("k8s descriptor should be rejected");
assert_eq!(
error.to_string(),
"compose deployer requires a compose existing-cluster descriptor"
);
}
}
=======
>>>>>>> 5baf93d (Align attach wording with existing-cluster mode)
/// Builds deployment metadata describing the externally managed cluster the
/// scenario attaches to (existing-cluster mode produces no compose project).
fn existing_cluster_metadata<E, Caps>(scenario: &Scenario<E, Caps>) -> ComposeDeploymentMetadata
where
    E: ComposeDeployEnv,
    Caps: Send + Sync,
{
    let descriptor = scenario.existing_cluster();
    ComposeDeploymentMetadata::from_existing_cluster(descriptor)
}
/// Artifacts produced by `deploy_nodes`.
struct DeployedNodes<E: ComposeDeployEnv> {
    // Host-side port assignments for the deployed nodes.
    host_ports: HostPortMapping,
    // Host the compose runner is reachable on (see `compose_runner_host`).
    host: String,
    // Clients for the deployed nodes; may later be replaced by source orchestration.
    node_clients: NodeClients<E>,
    // Kept so the block feed can be started from the same builder later.
    client_builder: ClientBuilder<E>,
}
/// Output of `build_compose_runtime`: the runtime assembly plus the handle of
/// the background block-feed task (surrendered to the cleanup guard).
struct ComposeRuntime<E: ComposeDeployEnv> {
    assembly: RuntimeAssembly<E>,
    feed_task: FeedHandle,
}
/// Bundled inputs for `build_compose_runtime`, collecting everything the
/// runtime assembly needs from the deployment phase.
struct RuntimeBuildInput<'a, E: ComposeDeployEnv> {
    deployed: &'a DeployedNodes<E>,
    descriptors: E::Deployment,
    // Scenario run duration.
    duration: Duration,
    expectation_cooldown: Duration,
    cluster_control_profile: ClusterControlProfile,
    telemetry: Metrics,
    environment: &'a mut StackEnvironment,
    // Present only when the capability set requires node control.
    node_control: Option<Arc<dyn NodeControlHandle<E>>>,
    cluster_wait: Arc<dyn ClusterWaitHandle<E>>,
}
/// Assembles the compose runtime: verifies at least one node client exists,
/// starts the block feed, and wires everything into a `RuntimeAssembly`.
///
/// # Errors
/// `RuntimePreflight` when no node clients are available; otherwise any error
/// from starting the block feed.
async fn build_compose_runtime<E: ComposeDeployEnv>(
    input: RuntimeBuildInput<'_, E>,
) -> Result<ComposeRuntime<E>, ComposeRunnerError> {
    let clients = input.deployed.node_clients.clone();
    if clients.is_empty() {
        return Err(ComposeRunnerError::RuntimePreflight);
    }
    let (feed, feed_task) = input
        .deployed
        .client_builder
        .start_block_feed(&clients, input.environment)
        .await?;
    let assembly = build_runtime_assembly(
        input.descriptors,
        clients,
        input.duration,
        input.expectation_cooldown,
        input.cluster_control_profile,
        input.telemetry,
        feed,
        input.node_control,
        input.cluster_wait,
    );
    Ok(ComposeRuntime { assembly, feed_task })
}
/// Brings the node stack up: allocates host ports, waits for readiness (or a
/// grace period when disabled), then builds one client per node.
async fn deploy_nodes<E: ComposeDeployEnv>(
    environment: &mut StackEnvironment,
    descriptors: &E::Deployment,
    readiness_enabled: bool,
    readiness_requirement: HttpReadinessRequirement,
) -> Result<DeployedNodes<E>, ComposeRunnerError> {
    let ports = PortManager::<E>::prepare(environment, descriptors).await?;
    wait_for_readiness_or_grace_period::<E>(
        readiness_enabled,
        descriptors,
        readiness_requirement,
        &ports,
        environment,
    )
    .await?;
    let runner_host = compose_runner_host();
    let builder = ClientBuilder::<E>::new();
    let clients = builder
        .build_node_clients(descriptors, &ports, &runner_host, environment)
        .await?;
    Ok(DeployedNodes {
        host_ports: ports,
        host: runner_host,
        node_clients: clients,
        client_builder: builder,
    })
}
/// Wires the assembled runtime pieces into a `RuntimeAssembly`, always
/// attaching the cluster-wait handle and optionally the node-control handle.
fn build_runtime_assembly<E: ComposeDeployEnv>(
    descriptors: E::Deployment,
    node_clients: NodeClients<E>,
    run_duration: Duration,
    expectation_cooldown: Duration,
    cluster_control_profile: ClusterControlProfile,
    telemetry: Metrics,
    feed: <E::FeedRuntime as FeedRuntime>::Feed,
    node_control: Option<Arc<dyn NodeControlHandle<E>>>,
    cluster_wait: Arc<dyn ClusterWaitHandle<E>>,
) -> RuntimeAssembly<E> {
    let base = RuntimeAssembly::new(
        descriptors,
        node_clients,
        run_duration,
        expectation_cooldown,
        cluster_control_profile,
        telemetry,
        feed,
    )
    .with_cluster_wait(cluster_wait);
    match node_control {
        Some(control) => base.with_node_control(control),
        None => base,
    }
}
/// Resolves observability settings: environment-derived inputs, overridden by
/// any observability capability the scenario declares.
fn resolve_observability_inputs<E, Caps>(
    scenario: &Scenario<E, Caps>,
) -> Result<ObservabilityInputs, ComposeRunnerError>
where
    Caps: ObservabilityCapabilityProvider,
    E: ComposeDeployEnv,
{
    let from_env = ObservabilityInputs::from_env()?;
    let from_capability = match scenario.capabilities().observability_capability() {
        Some(capability) => ObservabilityInputs::from_capability(capability),
        None => ObservabilityInputs::default(),
    };
    Ok(from_env.with_overrides(from_capability))
}
/// Waits for all nodes to become ready, or — when readiness checks are
/// disabled — grants the stack a short grace period instead.
async fn wait_for_readiness_or_grace_period<E: ComposeDeployEnv>(
    readiness_checks: bool,
    descriptors: &E::Deployment,
    readiness_requirement: HttpReadinessRequirement,
    host_ports: &HostPortMapping,
    environment: &mut StackEnvironment,
) -> Result<(), ComposeRunnerError> {
    if !readiness_checks {
        info!("readiness checks disabled; giving the stack a short grace period");
        // NOTE(review): `false` is passed unchanged from the original flow —
        // confirm against `maybe_sleep_for_disabled_readiness`'s parameter.
        crate::lifecycle::readiness::maybe_sleep_for_disabled_readiness(false).await;
        return Ok(());
    }
    ReadinessChecker::<E>::wait_all(descriptors, host_ports, readiness_requirement, environment)
        .await?;
    Ok(())
}
/// Logs whichever observability endpoints are configured; silent otherwise.
fn log_observability_endpoints(observability: &ObservabilityInputs) {
    if let Some(metrics_url) = observability.metrics_query_url.as_ref() {
        info!(
            metrics_query_url = %metrics_url.as_str(),
            "metrics query endpoint configured"
        );
    }
    if let Some(grafana) = observability.grafana_url.as_ref() {
        info!(grafana_url = %grafana.as_str(), "grafana url configured");
    }
}
/// Prints observability and profiling endpoints to stdout, but only when the
/// `TESTNET_PRINT_ENDPOINTS` env var is set.
fn maybe_print_endpoints(observability: &ObservabilityInputs, host: &str, ports: &HostPortMapping) {
    if should_print_endpoints() {
        let prometheus = endpoint_or_disabled(observability.metrics_query_url.as_ref());
        let grafana = endpoint_or_disabled(observability.grafana_url.as_ref());
        println!(
            "TESTNET_ENDPOINTS prometheus={} grafana={}",
            prometheus, grafana
        );
        print_profiling_urls(host, ports);
    }
}
/// True when the print-endpoints env var is set; the value itself is ignored —
/// only presence matters.
fn should_print_endpoints() -> bool {
    env::var(PRINT_ENDPOINTS_ENV).is_ok()
}
/// Renders an optional endpoint as its URL string, or `<disabled>` when absent.
fn endpoint_or_disabled(endpoint: Option<&Url>) -> String {
    match endpoint {
        Some(url) => url.as_str().to_string(),
        None => "<disabled>".to_string(),
    }
}
/// Logs the pprof profiling URL for every node's API port.
fn log_profiling_urls(host: &str, ports: &HostPortMapping) {
    for (node_index, node_ports) in ports.nodes.iter().enumerate() {
        info!(
            node = node_index,
            profiling_url = %profiling_url(host, node_ports.api),
            "node profiling endpoint (profiling feature required)"
        );
    }
}
/// Prints one `TESTNET_PPROF` stdout line per node with its profiling URL.
fn print_profiling_urls(host: &str, ports: &HostPortMapping) {
    for (node_index, node_ports) in ports.nodes.iter().enumerate() {
        let url = profiling_url(host, node_ports.api);
        println!("TESTNET_PPROF node_{}={}", node_index, url);
    }
}
/// Builds the pprof CPU-profile URL for one node's API port
/// (15-second sample, protobuf output).
fn profiling_url(host: &str, api_port: u16) -> String {
    let endpoint = format!("http://{host}:{api_port}/debug/pprof/profile?seconds=15&format=proto");
    endpoint
}
/// Workspace produced by `prepare_deployment`: the stack environment plus the
/// node deployment descriptors to launch in it.
struct PreparedDeployment<E: ComposeDeployEnv> {
    environment: StackEnvironment,
    descriptors: E::Deployment,
}
/// Prepares the compose workspace and returns it with its descriptors.
///
/// Fix: the destructuring `let` already moves `descriptors` out of the
/// `DeploymentContext` by value, so the previous `descriptors.clone()`
/// duplicated an owned value for no benefit (clippy `redundant_clone`);
/// the owned value is now stored directly.
async fn prepare_deployment<E: ComposeDeployEnv>(
    setup: DeploymentSetup<'_, E>,
    observability: &ObservabilityInputs,
) -> Result<PreparedDeployment<E>, ComposeRunnerError> {
    let DeploymentContext {
        environment,
        descriptors,
    } = setup.prepare_workspace(observability).await?;
    Ok(PreparedDeployment {
        environment,
        descriptors,
    })
}