//! Project-level advisory file lock for setup. Dual-rail: `fs2::FileExt::try_lock_exclusive`
//! + PID/start_time sentinel JSON. Sentinel handles sandbox/NFS/container edge cases
//! where flock alone is unreliable.
//!
//! Acquire order:
//! 1. Read `.atomcode/.setup.lock.sentinel` if present. If recorded PID is alive **and**
//!    its start_time matches, return [`LockError::Held`] (unless `force = true`).
//!    Stale sentinel is removed.
//! 2. `try_lock_exclusive` on `.atomcode/.setup.lock` — second rail.
//! 3. Write a fresh sentinel JSON with current PID, start_time, host, version.
//!
//! Drop releases both rails (unlock fs2, rm sentinel) but keeps the `.setup.lock`
//! file so future opens reuse the inode.

use fs2::FileExt;
use serde::{Deserialize, Serialize};
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::{Path, PathBuf};
use thiserror::Error;

const LOCK_FILE: &str = ".setup.lock";
const SENTINEL_FILE: &str = ".setup.lock.sentinel";

/// Errors produced while acquiring the project-level setup lock.
#[derive(Debug, Error)]
pub enum LockError {
    /// Another setup run is considered to hold the lock.
    ///
    /// The fields describe the holder as recorded in the sentinel file. When no
    /// readable sentinel identifies the holder, `acquire` fills in `pid = 0`, a
    /// placeholder `start_time` text, and the local host name instead.
    #[error(
        "Setup is already running (PID {pid} @ {host}, started {start_time}). Use --force to override."
    )]
    Held {
        /// PID of the recorded holder, or `0` when the holder could not be identified.
        pid: u32,
        /// Human-readable start-time text (e.g. `"<nanos> ns"`), used only for display.
        start_time: String,
        /// Host name of the recorded holder (or the local host when unidentified).
        host: String,
    },
    /// An underlying I/O operation failed. Built automatically from
    /// [`std::io::Error`] via `?` thanks to `#[from]`.
    #[error("Lock acquisition io error: {0}")]
    Io(#[from] std::io::Error),
}

/// Guard for a held setup lock.
///
/// Obtained from `SetupLock::acquire`. While the value is alive the process holds
/// the fs2 lock on `.setup.lock` and owns the sentinel file; dropping the value
/// releases both (see the `Drop` impl).
#[derive(Debug)]
pub struct SetupLock {
    /// Open handle to `.setup.lock` on which the fs2 exclusive lock was taken.
    fd: File,
    /// Path to the project's `.atomcode/.setup.lock` file. Kept around for
    /// diagnostics and future callers (e.g. error messages, force-cleanup CLI).
    #[allow(dead_code)]
    pub(super) lock_path: PathBuf,
    /// Path to the project's `.atomcode/.setup.lock.sentinel` file, removed on drop.
    pub(super) sentinel_path: PathBuf,
}

/// On-disk JSON record identifying the process that currently owns the setup lock.
///
/// Serialized with serde, so the field names below are the JSON keys.
#[derive(Debug, Serialize, Deserialize)]
struct Sentinel {
    /// PID of the owning process.
    pid: u32,
    /// Owner's process start time in nanoseconds since the UNIX epoch. Paired with
    /// `pid` so a reused PID is not mistaken for the original owner.
    start_time_nanos: u128,
    /// Host name of the machine the owner ran on (`"unknown"` if unavailable).
    host: String,
    /// Crate version (`CARGO_PKG_VERSION`) of the binary that wrote the sentinel.
    atomcode_version: String,
}

/// Directory that holds the lock and sentinel files: `<project_root>/.atomcode`.
fn lock_dir(project_root: &Path) -> PathBuf {
    let mut dir = project_root.to_path_buf();
    dir.push(".atomcode");
    dir
}

/// OS-assigned identifier of the running process.
fn current_pid() -> u32 {
    use std::process;

    process::id()
}

/// Host name as reported by `sysinfo`, or the literal `"unknown"` when it is
/// unavailable.
fn hostname() -> String {
    match sysinfo::System::host_name() {
        Some(name) => name,
        None => "unknown".to_string(),
    }
}

/// Start time of the running process, expressed as nanoseconds since the UNIX epoch.
///
/// `sysinfo` reports `start_time()` in whole seconds (u64); the value is scaled to
/// nanoseconds so the sentinel's JSON field keeps one unit on every OS and leaves
/// room for finer granularity later. Yields `0` when `sysinfo` cannot find the
/// current process.
fn current_start_time_nanos() -> u128 {
    use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};

    const NANOS_PER_SEC: u128 = 1_000_000_000;

    let own_pid = Pid::from_u32(current_pid());
    let mut system = System::new();
    // Refresh only our own entry, with no extra per-process details requested.
    system.refresh_processes_specifics(
        ProcessesToUpdate::Some(&[own_pid]),
        false,
        ProcessRefreshKind::new(),
    );

    match system.process(own_pid) {
        Some(info) => u128::from(info.start_time()) * NANOS_PER_SEC,
        None => 0,
    }
}

/// Loads and parses the sentinel JSON at `path`.
///
/// Any failure (file missing, unreadable, or not valid sentinel JSON) collapses
/// to `None`; callers treat that the same as "no sentinel".
fn read_sentinel(path: &Path) -> Option<Sentinel> {
    std::fs::read_to_string(path)
        .ok()
        .and_then(|text| serde_json::from_str::<Sentinel>(&text).ok())
}

/// Reports whether the process identified by `pid` is still the one that started
/// at `start_time_nanos`.
///
/// Returns `true` only when `sysinfo` finds a process with that PID **and** its
/// start time (seconds scaled to nanoseconds) equals `start_time_nanos`. A PID that
/// was recycled after the previous setup crashed therefore reads as not alive,
/// because its start time differs.
fn process_alive_at(pid: u32, start_time_nanos: u128) -> bool {
    use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};

    let wanted = Pid::from_u32(pid);
    let mut system = System::new();
    // Refresh only the PID in question, with no extra per-process details requested.
    system.refresh_processes_specifics(
        ProcessesToUpdate::Some(&[wanted]),
        false,
        ProcessRefreshKind::new(),
    );

    system
        .process(wanted)
        .is_some_and(|info| (info.start_time() as u128) * 1_000_000_000 == start_time_nanos)
}

impl SetupLock {
    pub fn acquire(project_root: &Path, force: bool) -> Result<Self, LockError> {
        let dir = lock_dir(project_root);
        std::fs::create_dir_all(&dir)?;
        let lock_path = dir.join(LOCK_FILE);
        let sentinel_path = dir.join(SENTINEL_FILE);

        // Stage 1: inspect sentinel (primary rail).
        //
        // - If recorded owner is **alive** and !force: report Held with full identity.
        // - If recorded owner is **alive** and force: do NOT delete sentinel yet; let
        //   fs2 be the authority. If fs2 also held, force genuinely cannot take over
        //   (peer still running). If fs2 is releasable, the alive-check raced and the
        //   peer just exited — proceed with takeover (re-read in Stage 2 surfaces who).
        // - If recorded owner is **stale** (dead PID or start_time mismatch): delete
        //   sentinel so fs2 isn't confused by a leftover file.
        let sentinel_owner: Option<Sentinel> = read_sentinel(&sentinel_path);
        let owner_alive = sentinel_owner
            .as_ref()
            .is_some_and(|s| process_alive_at(s.pid, s.start_time_nanos));

        if let Some(meta) = sentinel_owner.as_ref() {
            if owner_alive && !force {
                return Err(LockError::Held {
                    pid: meta.pid,
                    start_time: format!("{} ns", meta.start_time_nanos),
                    host: meta.host.clone(),
                });
            }
            if !owner_alive {
                // Stale — clean up so fs2 won't see a leftover file from prior crash.
                let _ = std::fs::remove_file(&sentinel_path);
            }
            // owner_alive && force: keep sentinel for now; fs2 is the authority.
        }

        // Stage 2: fs2 try_lock_exclusive (secondary rail).
        let fd = OpenOptions::new()
            .create(true)
            .read(true)
            .write(true)
            .truncate(false)
            .open(&lock_path)?;

        if fd.try_lock_exclusive().is_err() {
            // fs2 failed. Re-read sentinel to surface the *real* holder identity.
            // Covers two race/edge cases:
            //   (a) TOCTOU between our stale-sentinel removal and our fs2 attempt:
            //       a sibling wrote a fresh sentinel + grabbed fs2 in the gap.
            //   (b) force=true but the live sibling still holds fs2 — force cannot
            //       take over a running peer; report the real PID so the user knows
            //       whom to kill.
            let live_owner = read_sentinel(&sentinel_path);
            return Err(match live_owner {
                Some(meta) => LockError::Held {
                    pid: meta.pid,
                    start_time: format!("{} ns", meta.start_time_nanos),
                    host: meta.host,
                },
                None => LockError::Held {
                    pid: 0,
                    start_time: "concurrent (sentinel missing/corrupt)".to_string(),
                    host: hostname(),
                },
            });
        }

        // Stage 3: we hold both rails. If force was used against a previously-alive
        // owner, the peer must have released fs2 between Stage 1 and Stage 2 — warn
        // so the operator knows takeover actually fired (the previous version warned
        // unconditionally before fs2 succeeded, which was misleading on failure).
        if force && owner_alive {
            if let Some(meta) = sentinel_owner.as_ref() {
                tracing::warn!(
                    pid = meta.pid,
                    host = %meta.host,
                    "forced setup lock takeover after sibling released fs2 lock"
                );
            }
            // Clean up the prior owner's sentinel before we write our own.
            let _ = std::fs::remove_file(&sentinel_path);
        }

        // Stage 4: write our sentinel.
        let sentinel = Sentinel {
            pid: current_pid(),
            start_time_nanos: current_start_time_nanos(),
            host: hostname(),
            atomcode_version: env!("CARGO_PKG_VERSION").to_string(),
        };
        let json = serde_json::to_string(&sentinel).expect("Sentinel serialize never fails");
        let mut f = File::create(&sentinel_path)?;
        f.write_all(json.as_bytes())?;
        f.sync_all()?;

        Ok(Self { fd, lock_path, sentinel_path })
    }
}

impl Drop for SetupLock {
    /// Releases both rails: removes the sentinel, then unlocks the fs2 lock.
    ///
    /// Best-effort: errors during drop are intentionally swallowed.
    fn drop(&mut self) {
        // Order matters. The sentinel is removed while we still hold the fs2 lock.
        // If we unlocked first, a contender could take the lock and write its own
        // sentinel in the gap, and our `remove_file` would then delete *its*
        // sentinel rather than ours.
        //
        // If the removal fails, the leftover sentinel keeps naming this process;
        // later acquirers treat it as stale once this process is no longer alive.
        let _ = std::fs::remove_file(&self.sentinel_path);

        // If unlock fails the OS will release the lock on process exit.
        let _ = fs2::FileExt::unlock(&self.fd);

        // Keep `.setup.lock` file itself so future opens reuse the inode.
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Fresh temporary directory standing in for a project root.
    fn scratch_project() -> tempfile::TempDir {
        tempfile::tempdir().unwrap()
    }

    /// A first acquire in an empty project leaves both rail files on disk.
    #[test]
    fn acquire_creates_lock_in_fresh_project() {
        let project = scratch_project();
        let guard = SetupLock::acquire(project.path(), false).unwrap();
        assert!(guard.lock_path.exists());
        assert!(guard.sentinel_path.exists());
    }

    /// While one guard is alive, a second non-forced acquire is rejected as `Held`.
    #[test]
    fn second_acquire_fails_when_first_held() {
        let project = scratch_project();
        let _holder = SetupLock::acquire(project.path(), false).unwrap();
        let outcome = SetupLock::acquire(project.path(), false);
        assert!(matches!(outcome.unwrap_err(), LockError::Held { .. }));
    }

    /// Dropping the guard frees the lock for the next caller.
    #[test]
    fn drop_releases_lock_so_next_acquire_succeeds() {
        let project = scratch_project();
        drop(SetupLock::acquire(project.path(), false).unwrap());
        // The first guard is gone; acquiring again must work.
        let _second = SetupLock::acquire(project.path(), false).unwrap();
    }

    /// `force` cannot steal an fs2 lock that is genuinely held, and the error must
    /// name the real holder (this process, which wrote the sentinel) rather than 0.
    #[test]
    fn force_with_alive_holder_still_fails_if_fs2_held() {
        let project = scratch_project();
        // The first guard owns both the sentinel and the fs2 lock.
        let _holder = SetupLock::acquire(project.path(), false).unwrap();

        match SetupLock::acquire(project.path(), true).unwrap_err() {
            LockError::Held { pid, .. } => {
                assert_eq!(pid, std::process::id(), "Held should surface real holder pid, not 0");
            }
            other => panic!("expected Held, got {other:?}"),
        }
    }

    /// A true race is hard to stage in a unit test, so this checks the
    /// post-condition instead: with a sentinel present, a non-forced acquire
    /// reports the sentinel's identity and not pid=0.
    #[test]
    fn fs2_race_loses_holder_identity_gracefully() {
        let project = scratch_project();
        let _holder = SetupLock::acquire(project.path(), false).unwrap();

        match SetupLock::acquire(project.path(), false).unwrap_err() {
            LockError::Held { pid, .. } => {
                assert_eq!(pid, std::process::id());
            }
            other => panic!("expected Held with real pid, got {other:?}"),
        }
    }
}