mirror of
https://github.com/affaan-m/everything-claude-code.git
synced 2026-04-16 23:23:29 +08:00
feat(ecc2): add session heartbeat stale detection
This commit is contained in:
@@ -22,10 +22,8 @@ pub async fn run(db: StateStore, cfg: Config) -> Result<()> {
|
||||
resume_crashed_sessions(&db)?;
|
||||
|
||||
let heartbeat_interval = Duration::from_secs(cfg.heartbeat_interval_secs);
|
||||
let timeout = Duration::from_secs(cfg.session_timeout_secs);
|
||||
|
||||
loop {
|
||||
if let Err(e) = check_sessions(&db, timeout) {
|
||||
if let Err(e) = check_sessions(&db, &cfg) {
|
||||
tracing::error!("Session check failed: {e}");
|
||||
}
|
||||
|
||||
@@ -82,25 +80,8 @@ where
|
||||
Ok(failed_sessions)
|
||||
}
|
||||
|
||||
fn check_sessions(db: &StateStore, timeout: Duration) -> Result<()> {
|
||||
let sessions = db.list_sessions()?;
|
||||
|
||||
for session in sessions {
|
||||
if session.state != SessionState::Running {
|
||||
continue;
|
||||
}
|
||||
|
||||
let elapsed = chrono::Utc::now()
|
||||
.signed_duration_since(session.updated_at)
|
||||
.to_std()
|
||||
.unwrap_or(Duration::ZERO);
|
||||
|
||||
if elapsed > timeout {
|
||||
tracing::warn!("Session {} timed out after {:?}", session.id, elapsed);
|
||||
db.update_state_and_pid(&session.id, &SessionState::Failed, None)?;
|
||||
}
|
||||
}
|
||||
|
||||
fn check_sessions(db: &StateStore, cfg: &Config) -> Result<()> {
|
||||
let _ = manager::enforce_session_heartbeats(db, cfg)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -498,6 +479,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,6 +68,58 @@ pub fn get_team_status(db: &StateStore, id: &str, depth: usize) -> Result<TeamSt
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize)]
|
||||
pub struct HeartbeatEnforcementOutcome {
|
||||
pub stale_sessions: Vec<String>,
|
||||
pub auto_terminated_sessions: Vec<String>,
|
||||
}
|
||||
|
||||
pub fn enforce_session_heartbeats(
|
||||
db: &StateStore,
|
||||
cfg: &Config,
|
||||
) -> Result<HeartbeatEnforcementOutcome> {
|
||||
enforce_session_heartbeats_with(db, cfg, kill_process)
|
||||
}
|
||||
|
||||
fn enforce_session_heartbeats_with<F>(
|
||||
db: &StateStore,
|
||||
cfg: &Config,
|
||||
terminate_pid: F,
|
||||
) -> Result<HeartbeatEnforcementOutcome>
|
||||
where
|
||||
F: Fn(u32) -> Result<()>,
|
||||
{
|
||||
let timeout = chrono::Duration::seconds(cfg.session_timeout_secs as i64);
|
||||
let now = chrono::Utc::now();
|
||||
let mut outcome = HeartbeatEnforcementOutcome::default();
|
||||
|
||||
for session in db.list_sessions()? {
|
||||
if !matches!(session.state, SessionState::Running | SessionState::Stale) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if now.signed_duration_since(session.last_heartbeat_at) <= timeout {
|
||||
continue;
|
||||
}
|
||||
|
||||
if cfg.auto_terminate_stale_sessions {
|
||||
if let Some(pid) = session.pid {
|
||||
let _ = terminate_pid(pid);
|
||||
}
|
||||
db.update_state_and_pid(&session.id, &SessionState::Failed, None)?;
|
||||
outcome.auto_terminated_sessions.push(session.id);
|
||||
continue;
|
||||
}
|
||||
|
||||
if session.state != SessionState::Stale {
|
||||
db.update_state(&session.id, &SessionState::Stale)?;
|
||||
outcome.stale_sessions.push(session.id);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(outcome)
|
||||
}
|
||||
|
||||
pub async fn assign_session(
|
||||
db: &StateStore,
|
||||
cfg: &Config,
|
||||
@@ -685,7 +737,7 @@ pub async fn merge_session_worktree(
|
||||
|
||||
if matches!(
|
||||
session.state,
|
||||
SessionState::Pending | SessionState::Running | SessionState::Idle
|
||||
SessionState::Pending | SessionState::Running | SessionState::Idle | SessionState::Stale
|
||||
) {
|
||||
anyhow::bail!(
|
||||
"Cannot merge active session {} while it is {}",
|
||||
@@ -747,7 +799,10 @@ pub async fn merge_ready_worktrees(
|
||||
|
||||
if matches!(
|
||||
session.state,
|
||||
SessionState::Pending | SessionState::Running | SessionState::Idle
|
||||
SessionState::Pending
|
||||
| SessionState::Running
|
||||
| SessionState::Idle
|
||||
| SessionState::Stale
|
||||
) {
|
||||
active_with_worktree_ids.push(session.id);
|
||||
continue;
|
||||
@@ -902,6 +957,7 @@ pub async fn run_session(
|
||||
session_id.to_string(),
|
||||
command,
|
||||
SessionOutputStore::default(),
|
||||
std::time::Duration::from_secs(cfg.heartbeat_interval_secs),
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
@@ -997,6 +1053,7 @@ fn build_session_record(
|
||||
worktree,
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
})
|
||||
}
|
||||
@@ -1488,6 +1545,15 @@ impl fmt::Display for SessionStatus {
|
||||
writeln!(f, "Tools: {}", s.metrics.tool_calls)?;
|
||||
writeln!(f, "Files: {}", s.metrics.files_changed)?;
|
||||
writeln!(f, "Cost: ${:.4}", s.metrics.cost_usd)?;
|
||||
writeln!(
|
||||
f,
|
||||
"Heartbeat: {} ({}s ago)",
|
||||
s.last_heartbeat_at,
|
||||
chrono::Utc::now()
|
||||
.signed_duration_since(s.last_heartbeat_at)
|
||||
.num_seconds()
|
||||
.max(0)
|
||||
)?;
|
||||
if !self.delegated_children.is_empty() {
|
||||
writeln!(f, "Children: {}", self.delegated_children.join(", "))?;
|
||||
}
|
||||
@@ -1528,6 +1594,7 @@ impl fmt::Display for TeamStatus {
|
||||
for lane in [
|
||||
"Running",
|
||||
"Idle",
|
||||
"Stale",
|
||||
"Pending",
|
||||
"Failed",
|
||||
"Stopped",
|
||||
@@ -1676,6 +1743,7 @@ fn session_state_label(state: &SessionState) -> &'static str {
|
||||
SessionState::Pending => "Pending",
|
||||
SessionState::Running => "Running",
|
||||
SessionState::Idle => "Idle",
|
||||
SessionState::Stale => "Stale",
|
||||
SessionState::Completed => "Completed",
|
||||
SessionState::Failed => "Failed",
|
||||
SessionState::Stopped => "Stopped",
|
||||
@@ -1727,6 +1795,7 @@ mod tests {
|
||||
max_parallel_worktrees: 4,
|
||||
session_timeout_secs: 60,
|
||||
heartbeat_interval_secs: 5,
|
||||
auto_terminate_stale_sessions: false,
|
||||
default_agent: "claude".to_string(),
|
||||
auto_dispatch_unread_handoffs: false,
|
||||
auto_dispatch_limit_per_session: 5,
|
||||
@@ -1755,10 +1824,85 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: updated_at - Duration::minutes(1),
|
||||
updated_at,
|
||||
last_heartbeat_at: updated_at,
|
||||
metrics: SessionMetrics::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// A running session whose heartbeat is older than the timeout is flagged as
/// stale (and keeps its pid) when automatic termination is disabled.
#[test]
fn enforce_session_heartbeats_marks_overdue_running_sessions_stale() -> Result<()> {
    let tempdir = TestDir::new("manager-heartbeat-stale")?;
    let cfg = build_config(tempdir.path());
    let db = StateStore::open(&cfg.db_path)?;

    // Five minutes in the past, against the test config's timeout.
    let overdue_since = Utc::now() - Duration::minutes(5);
    let overdue = Session {
        id: "stale-1".to_string(),
        task: "heartbeat overdue".to_string(),
        agent_type: "claude".to_string(),
        working_dir: PathBuf::from("/tmp"),
        state: SessionState::Running,
        pid: Some(4242),
        worktree: None,
        created_at: overdue_since,
        updated_at: overdue_since,
        last_heartbeat_at: overdue_since,
        metrics: SessionMetrics::default(),
    };
    db.insert_session(&overdue)?;

    let outcome = enforce_session_heartbeats(&db, &cfg)?;
    let stored = db.get_session("stale-1")?.expect("session should exist");

    assert_eq!(outcome.stale_sessions, vec!["stale-1".to_string()]);
    assert!(outcome.auto_terminated_sessions.is_empty());
    assert_eq!(stored.state, SessionState::Stale);
    assert_eq!(stored.pid, Some(4242));

    Ok(())
}
|
||||
|
||||
/// With automatic termination enabled, an overdue session has its pid handed
/// to the terminator and ends up `Failed` with no pid.
#[test]
fn enforce_session_heartbeats_auto_terminates_when_enabled() -> Result<()> {
    let tempdir = TestDir::new("manager-heartbeat-terminate")?;
    let cfg = {
        let mut cfg = build_config(tempdir.path());
        cfg.auto_terminate_stale_sessions = true;
        cfg
    };
    let db = StateStore::open(&cfg.db_path)?;

    // Records every pid the injected terminator is asked to kill.
    let killed = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
    let recorder = std::sync::Arc::clone(&killed);

    let overdue_since = Utc::now() - Duration::minutes(5);
    let overdue = Session {
        id: "stale-2".to_string(),
        task: "terminate overdue".to_string(),
        agent_type: "claude".to_string(),
        working_dir: PathBuf::from("/tmp"),
        state: SessionState::Running,
        pid: Some(7777),
        worktree: None,
        created_at: overdue_since,
        updated_at: overdue_since,
        last_heartbeat_at: overdue_since,
        metrics: SessionMetrics::default(),
    };
    db.insert_session(&overdue)?;

    let terminator = move |pid| {
        recorder.lock().unwrap().push(pid);
        Ok(())
    };
    let outcome = enforce_session_heartbeats_with(&db, &cfg, terminator)?;
    let stored = db.get_session("stale-2")?.expect("session should exist");

    assert!(outcome.stale_sessions.is_empty());
    assert_eq!(
        outcome.auto_terminated_sessions,
        vec!["stale-2".to_string()]
    );
    assert_eq!(*killed.lock().unwrap(), vec![7777]);
    assert_eq!(stored.state, SessionState::Failed);
    assert_eq!(stored.pid, None);

    Ok(())
}
|
||||
|
||||
fn build_daemon_activity() -> super::super::store::DaemonActivity {
|
||||
let now = Utc::now();
|
||||
super::super::store::DaemonActivity {
|
||||
@@ -1976,6 +2120,7 @@ mod tests {
|
||||
}),
|
||||
created_at: now - Duration::minutes(1),
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.update_metrics(
|
||||
@@ -2032,6 +2177,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(2),
|
||||
updated_at: now - Duration::minutes(1),
|
||||
last_heartbeat_at: now - Duration::minutes(1),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.update_metrics(
|
||||
@@ -2076,6 +2222,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(1),
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
@@ -2328,6 +2475,7 @@ mod tests {
|
||||
worktree: Some(merged_worktree.clone()),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
@@ -2343,6 +2491,7 @@ mod tests {
|
||||
worktree: Some(active_worktree.clone()),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
@@ -2359,6 +2508,7 @@ mod tests {
|
||||
worktree: Some(dirty_worktree.clone()),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
@@ -2584,6 +2734,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(2),
|
||||
updated_at: now - Duration::minutes(2),
|
||||
last_heartbeat_at: now - Duration::minutes(2),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.insert_session(&Session {
|
||||
@@ -2596,6 +2747,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(1),
|
||||
updated_at: now - Duration::minutes(1),
|
||||
last_heartbeat_at: now - Duration::minutes(1),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.send_message(
|
||||
@@ -2651,6 +2803,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(3),
|
||||
updated_at: now - Duration::minutes(3),
|
||||
last_heartbeat_at: now - Duration::minutes(3),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.insert_session(&Session {
|
||||
@@ -2663,6 +2816,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(2),
|
||||
updated_at: now - Duration::minutes(2),
|
||||
last_heartbeat_at: now - Duration::minutes(2),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.send_message(
|
||||
@@ -2727,6 +2881,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(3),
|
||||
updated_at: now - Duration::minutes(3),
|
||||
last_heartbeat_at: now - Duration::minutes(3),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.insert_session(&Session {
|
||||
@@ -2739,6 +2894,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(2),
|
||||
updated_at: now - Duration::minutes(2),
|
||||
last_heartbeat_at: now - Duration::minutes(2),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.send_message(
|
||||
@@ -2794,6 +2950,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(3),
|
||||
updated_at: now - Duration::minutes(3),
|
||||
last_heartbeat_at: now - Duration::minutes(3),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.insert_session(&Session {
|
||||
@@ -2806,6 +2963,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(2),
|
||||
updated_at: now - Duration::minutes(2),
|
||||
last_heartbeat_at: now - Duration::minutes(2),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.send_message(
|
||||
@@ -2865,6 +3023,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(3),
|
||||
updated_at: now - Duration::minutes(3),
|
||||
last_heartbeat_at: now - Duration::minutes(3),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.insert_session(&Session {
|
||||
@@ -2877,6 +3036,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(2),
|
||||
updated_at: now - Duration::minutes(2),
|
||||
last_heartbeat_at: now - Duration::minutes(2),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.send_message(
|
||||
@@ -2930,6 +3090,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(3),
|
||||
updated_at: now - Duration::minutes(3),
|
||||
last_heartbeat_at: now - Duration::minutes(3),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
@@ -2977,6 +3138,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(3),
|
||||
updated_at: now - Duration::minutes(3),
|
||||
last_heartbeat_at: now - Duration::minutes(3),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.insert_session(&Session {
|
||||
@@ -2989,6 +3151,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(2),
|
||||
updated_at: now - Duration::minutes(2),
|
||||
last_heartbeat_at: now - Duration::minutes(2),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.send_message(
|
||||
@@ -3044,6 +3207,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(3),
|
||||
updated_at: now - Duration::minutes(3),
|
||||
last_heartbeat_at: now - Duration::minutes(3),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
}
|
||||
@@ -3103,6 +3267,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(3),
|
||||
updated_at: now - Duration::minutes(3),
|
||||
last_heartbeat_at: now - Duration::minutes(3),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
}
|
||||
@@ -3154,6 +3319,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(3),
|
||||
updated_at: now - Duration::minutes(3),
|
||||
last_heartbeat_at: now - Duration::minutes(3),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
@@ -3167,6 +3333,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(2),
|
||||
updated_at: now - Duration::minutes(2),
|
||||
last_heartbeat_at: now - Duration::minutes(2),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
@@ -3222,6 +3389,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(4),
|
||||
updated_at: now - Duration::minutes(4),
|
||||
last_heartbeat_at: now - Duration::minutes(4),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.insert_session(&Session {
|
||||
@@ -3234,6 +3402,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(3),
|
||||
updated_at: now - Duration::minutes(3),
|
||||
last_heartbeat_at: now - Duration::minutes(3),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.insert_session(&Session {
|
||||
@@ -3246,6 +3415,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(2),
|
||||
updated_at: now - Duration::minutes(2),
|
||||
last_heartbeat_at: now - Duration::minutes(2),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
@@ -3307,6 +3477,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(4),
|
||||
updated_at: now - Duration::minutes(4),
|
||||
last_heartbeat_at: now - Duration::minutes(4),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.insert_session(&Session {
|
||||
@@ -3319,6 +3490,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - Duration::minutes(3),
|
||||
updated_at: now - Duration::minutes(3),
|
||||
last_heartbeat_at: now - Duration::minutes(3),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ pub struct Session {
|
||||
pub worktree: Option<WorktreeInfo>,
|
||||
pub created_at: DateTime<Utc>,
|
||||
pub updated_at: DateTime<Utc>,
|
||||
pub last_heartbeat_at: DateTime<Utc>,
|
||||
pub metrics: SessionMetrics,
|
||||
}
|
||||
|
||||
@@ -28,6 +29,7 @@ pub enum SessionState {
|
||||
Pending,
|
||||
Running,
|
||||
Idle,
|
||||
Stale,
|
||||
Completed,
|
||||
Failed,
|
||||
Stopped,
|
||||
@@ -39,6 +41,7 @@ impl fmt::Display for SessionState {
|
||||
SessionState::Pending => write!(f, "pending"),
|
||||
SessionState::Running => write!(f, "running"),
|
||||
SessionState::Idle => write!(f, "idle"),
|
||||
SessionState::Stale => write!(f, "stale"),
|
||||
SessionState::Completed => write!(f, "completed"),
|
||||
SessionState::Failed => write!(f, "failed"),
|
||||
SessionState::Stopped => write!(f, "stopped"),
|
||||
@@ -60,12 +63,21 @@ impl SessionState {
|
||||
) | (
|
||||
SessionState::Running,
|
||||
SessionState::Idle
|
||||
| SessionState::Stale
|
||||
| SessionState::Completed
|
||||
| SessionState::Failed
|
||||
| SessionState::Stopped
|
||||
) | (
|
||||
SessionState::Idle,
|
||||
SessionState::Running
|
||||
| SessionState::Stale
|
||||
| SessionState::Completed
|
||||
| SessionState::Failed
|
||||
| SessionState::Stopped
|
||||
) | (
|
||||
SessionState::Stale,
|
||||
SessionState::Running
|
||||
| SessionState::Idle
|
||||
| SessionState::Completed
|
||||
| SessionState::Failed
|
||||
| SessionState::Stopped
|
||||
@@ -78,6 +90,7 @@ impl SessionState {
|
||||
match value {
|
||||
"running" => SessionState::Running,
|
||||
"idle" => SessionState::Idle,
|
||||
"stale" => SessionState::Stale,
|
||||
"completed" => SessionState::Completed,
|
||||
"failed" => SessionState::Failed,
|
||||
"stopped" => SessionState::Stopped,
|
||||
|
||||
@@ -5,6 +5,7 @@ use anyhow::{Context, Result};
|
||||
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
|
||||
use tokio::process::Command;
|
||||
use tokio::sync::{mpsc, oneshot};
|
||||
use tokio::time::{self, MissedTickBehavior};
|
||||
|
||||
use super::output::{OutputStream, SessionOutputStore};
|
||||
use super::store::StateStore;
|
||||
@@ -26,6 +27,9 @@ enum DbMessage {
|
||||
line: String,
|
||||
ack: oneshot::Sender<DbAck>,
|
||||
},
|
||||
TouchHeartbeat {
|
||||
ack: oneshot::Sender<DbAck>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -53,6 +57,10 @@ impl DbWriter {
|
||||
.await
|
||||
}
|
||||
|
||||
async fn touch_heartbeat(&self) -> Result<()> {
|
||||
self.send(|ack| DbMessage::TouchHeartbeat { ack }).await
|
||||
}
|
||||
|
||||
async fn send<F>(&self, build: F) -> Result<()>
|
||||
where
|
||||
F: FnOnce(oneshot::Sender<DbAck>) -> DbMessage,
|
||||
@@ -111,6 +119,17 @@ fn run_db_writer(db_path: PathBuf, session_id: String, mut rx: mpsc::UnboundedRe
|
||||
};
|
||||
let _ = ack.send(result);
|
||||
}
|
||||
DbMessage::TouchHeartbeat { ack } => {
|
||||
let result = match opened.as_ref() {
|
||||
Some(db) => db
|
||||
.touch_heartbeat(&session_id)
|
||||
.map_err(|error| error.to_string()),
|
||||
None => Err(open_error
|
||||
.clone()
|
||||
.unwrap_or_else(|| "Failed to open state store".to_string())),
|
||||
};
|
||||
let _ = ack.send(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -120,6 +139,7 @@ pub async fn capture_command_output(
|
||||
session_id: String,
|
||||
mut command: Command,
|
||||
output_store: SessionOutputStore,
|
||||
heartbeat_interval: std::time::Duration,
|
||||
) -> Result<ExitStatus> {
|
||||
let db_writer = DbWriter::start(db_path, session_id.clone());
|
||||
|
||||
@@ -152,6 +172,19 @@ pub async fn capture_command_output(
|
||||
.ok_or_else(|| anyhow::anyhow!("Spawned process did not expose a process id"))?;
|
||||
db_writer.update_pid(Some(pid)).await?;
|
||||
db_writer.update_state(SessionState::Running).await?;
|
||||
db_writer.touch_heartbeat().await?;
|
||||
|
||||
let heartbeat_writer = db_writer.clone();
|
||||
let heartbeat_task = tokio::spawn(async move {
|
||||
let mut ticker = time::interval(heartbeat_interval);
|
||||
ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
|
||||
loop {
|
||||
ticker.tick().await;
|
||||
if heartbeat_writer.touch_heartbeat().await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let stdout_task = tokio::spawn(capture_stream(
|
||||
session_id.clone(),
|
||||
@@ -169,6 +202,8 @@ pub async fn capture_command_output(
|
||||
));
|
||||
|
||||
let status = child.wait().await?;
|
||||
heartbeat_task.abort();
|
||||
let _ = heartbeat_task.await;
|
||||
stdout_task.await??;
|
||||
stderr_task.await??;
|
||||
|
||||
@@ -244,6 +279,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
@@ -254,9 +290,14 @@ mod tests {
|
||||
.arg("-c")
|
||||
.arg("printf 'alpha\\n'; printf 'beta\\n' >&2");
|
||||
|
||||
let status =
|
||||
capture_command_output(db_path.clone(), session_id.clone(), command, output_store)
|
||||
.await?;
|
||||
let status = capture_command_output(
|
||||
db_path.clone(),
|
||||
session_id.clone(),
|
||||
command,
|
||||
output_store,
|
||||
std::time::Duration::from_millis(10),
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert!(status.success());
|
||||
|
||||
@@ -286,4 +327,49 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn capture_command_output_updates_heartbeat_for_quiet_processes() -> Result<()> {
|
||||
let db_path = env::temp_dir().join(format!("ecc2-runtime-heartbeat-{}.db", Uuid::new_v4()));
|
||||
let db = StateStore::open(&db_path)?;
|
||||
let session_id = "session-heartbeat".to_string();
|
||||
let now = Utc::now();
|
||||
|
||||
db.insert_session(&Session {
|
||||
id: session_id.clone(),
|
||||
task: "quiet process".to_string(),
|
||||
agent_type: "test".to_string(),
|
||||
working_dir: env::temp_dir(),
|
||||
state: SessionState::Pending,
|
||||
pid: None,
|
||||
worktree: None,
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
let mut command = Command::new("/bin/sh");
|
||||
command.arg("-c").arg("sleep 0.05");
|
||||
|
||||
let _ = capture_command_output(
|
||||
db_path.clone(),
|
||||
session_id.clone(),
|
||||
command,
|
||||
SessionOutputStore::default(),
|
||||
std::time::Duration::from_millis(10),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let db = StateStore::open(&db_path)?;
|
||||
let session = db
|
||||
.get_session(&session_id)?
|
||||
.expect("session should still exist");
|
||||
|
||||
assert!(session.last_heartbeat_at > now);
|
||||
assert_eq!(session.state, SessionState::Completed);
|
||||
|
||||
let _ = std::fs::remove_file(db_path);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -132,7 +132,8 @@ impl StateStore {
|
||||
duration_secs INTEGER DEFAULT 0,
|
||||
cost_usd REAL DEFAULT 0.0,
|
||||
created_at TEXT NOT NULL,
|
||||
updated_at TEXT NOT NULL
|
||||
updated_at TEXT NOT NULL,
|
||||
last_heartbeat_at TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS tool_log (
|
||||
@@ -240,6 +241,20 @@ impl StateStore {
|
||||
.context("Failed to add output_tokens column to sessions table")?;
|
||||
}
|
||||
|
||||
if !self.has_column("sessions", "last_heartbeat_at")? {
|
||||
self.conn
|
||||
.execute("ALTER TABLE sessions ADD COLUMN last_heartbeat_at TEXT", [])
|
||||
.context("Failed to add last_heartbeat_at column to sessions table")?;
|
||||
self.conn
|
||||
.execute(
|
||||
"UPDATE sessions
|
||||
SET last_heartbeat_at = updated_at
|
||||
WHERE last_heartbeat_at IS NULL",
|
||||
[],
|
||||
)
|
||||
.context("Failed to backfill last_heartbeat_at column")?;
|
||||
}
|
||||
|
||||
if !self.has_column("tool_log", "hook_event_id")? {
|
||||
self.conn
|
||||
.execute("ALTER TABLE tool_log ADD COLUMN hook_event_id TEXT", [])
|
||||
@@ -404,8 +419,8 @@ impl StateStore {
|
||||
|
||||
pub fn insert_session(&self, session: &Session) -> Result<()> {
|
||||
self.conn.execute(
|
||||
"INSERT INTO sessions (id, task, agent_type, working_dir, state, pid, worktree_path, worktree_branch, worktree_base, created_at, updated_at)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11)",
|
||||
"INSERT INTO sessions (id, task, agent_type, working_dir, state, pid, worktree_path, worktree_branch, worktree_base, created_at, updated_at, last_heartbeat_at)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)",
|
||||
rusqlite::params![
|
||||
session.id,
|
||||
session.task,
|
||||
@@ -421,6 +436,7 @@ impl StateStore {
|
||||
session.worktree.as_ref().map(|w| w.base_branch.clone()),
|
||||
session.created_at.to_rfc3339(),
|
||||
session.updated_at.to_rfc3339(),
|
||||
session.last_heartbeat_at.to_rfc3339(),
|
||||
],
|
||||
)?;
|
||||
Ok(())
|
||||
@@ -433,7 +449,12 @@ impl StateStore {
|
||||
pid: Option<u32>,
|
||||
) -> Result<()> {
|
||||
let updated = self.conn.execute(
|
||||
"UPDATE sessions SET state = ?1, pid = ?2, updated_at = ?3 WHERE id = ?4",
|
||||
"UPDATE sessions
|
||||
SET state = ?1,
|
||||
pid = ?2,
|
||||
updated_at = ?3,
|
||||
last_heartbeat_at = ?3
|
||||
WHERE id = ?4",
|
||||
rusqlite::params![
|
||||
state.to_string(),
|
||||
pid.map(i64::from),
|
||||
@@ -470,7 +491,11 @@ impl StateStore {
|
||||
}
|
||||
|
||||
let updated = self.conn.execute(
|
||||
"UPDATE sessions SET state = ?1, updated_at = ?2 WHERE id = ?3",
|
||||
"UPDATE sessions
|
||||
SET state = ?1,
|
||||
updated_at = ?2,
|
||||
last_heartbeat_at = ?2
|
||||
WHERE id = ?3",
|
||||
rusqlite::params![
|
||||
state.to_string(),
|
||||
chrono::Utc::now().to_rfc3339(),
|
||||
@@ -487,7 +512,11 @@ impl StateStore {
|
||||
|
||||
pub fn update_pid(&self, session_id: &str, pid: Option<u32>) -> Result<()> {
|
||||
let updated = self.conn.execute(
|
||||
"UPDATE sessions SET pid = ?1, updated_at = ?2 WHERE id = ?3",
|
||||
"UPDATE sessions
|
||||
SET pid = ?1,
|
||||
updated_at = ?2,
|
||||
last_heartbeat_at = ?2
|
||||
WHERE id = ?3",
|
||||
rusqlite::params![
|
||||
pid.map(i64::from),
|
||||
chrono::Utc::now().to_rfc3339(),
|
||||
@@ -505,7 +534,11 @@ impl StateStore {
|
||||
pub fn clear_worktree(&self, session_id: &str) -> Result<()> {
|
||||
let updated = self.conn.execute(
|
||||
"UPDATE sessions
|
||||
SET worktree_path = NULL, worktree_branch = NULL, worktree_base = NULL, updated_at = ?1
|
||||
SET worktree_path = NULL,
|
||||
worktree_branch = NULL,
|
||||
worktree_base = NULL,
|
||||
updated_at = ?1,
|
||||
last_heartbeat_at = ?1
|
||||
WHERE id = ?2",
|
||||
rusqlite::params![chrono::Utc::now().to_rfc3339(), session_id],
|
||||
)?;
|
||||
@@ -571,7 +604,10 @@ impl StateStore {
|
||||
.unwrap_or_default()
|
||||
.with_timezone(&chrono::Utc);
|
||||
let effective_end = match state {
|
||||
SessionState::Pending | SessionState::Running | SessionState::Idle => now,
|
||||
SessionState::Pending
|
||||
| SessionState::Running
|
||||
| SessionState::Idle
|
||||
| SessionState::Stale => now,
|
||||
SessionState::Completed | SessionState::Failed | SessionState::Stopped => {
|
||||
updated_at
|
||||
}
|
||||
@@ -592,6 +628,20 @@ impl StateStore {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn touch_heartbeat(&self, session_id: &str) -> Result<()> {
|
||||
let now = chrono::Utc::now().to_rfc3339();
|
||||
let updated = self.conn.execute(
|
||||
"UPDATE sessions SET last_heartbeat_at = ?1 WHERE id = ?2",
|
||||
rusqlite::params![now, session_id],
|
||||
)?;
|
||||
|
||||
if updated == 0 {
|
||||
anyhow::bail!("Session not found: {session_id}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn sync_cost_tracker_metrics(&self, metrics_path: &Path) -> Result<()> {
|
||||
if !metrics_path.exists() {
|
||||
return Ok(());
|
||||
@@ -786,7 +836,11 @@ impl StateStore {
|
||||
|
||||
pub fn increment_tool_calls(&self, session_id: &str) -> Result<()> {
|
||||
self.conn.execute(
|
||||
"UPDATE sessions SET tool_calls = tool_calls + 1, updated_at = ?1 WHERE id = ?2",
|
||||
"UPDATE sessions
|
||||
SET tool_calls = tool_calls + 1,
|
||||
updated_at = ?1,
|
||||
last_heartbeat_at = ?1
|
||||
WHERE id = ?2",
|
||||
rusqlite::params![chrono::Utc::now().to_rfc3339(), session_id],
|
||||
)?;
|
||||
Ok(())
|
||||
@@ -796,7 +850,7 @@ impl StateStore {
|
||||
let mut stmt = self.conn.prepare(
|
||||
"SELECT id, task, agent_type, working_dir, state, pid, worktree_path, worktree_branch, worktree_base,
|
||||
input_tokens, output_tokens, tokens_used, tool_calls, files_changed, duration_secs, cost_usd,
|
||||
created_at, updated_at
|
||||
created_at, updated_at, last_heartbeat_at
|
||||
FROM sessions ORDER BY updated_at DESC",
|
||||
)?;
|
||||
|
||||
@@ -814,6 +868,7 @@ impl StateStore {
|
||||
|
||||
let created_str: String = row.get(16)?;
|
||||
let updated_str: String = row.get(17)?;
|
||||
let heartbeat_str: String = row.get(18)?;
|
||||
|
||||
Ok(Session {
|
||||
id: row.get(0)?,
|
||||
@@ -829,6 +884,11 @@ impl StateStore {
|
||||
updated_at: chrono::DateTime::parse_from_rfc3339(&updated_str)
|
||||
.unwrap_or_default()
|
||||
.with_timezone(&chrono::Utc),
|
||||
last_heartbeat_at: chrono::DateTime::parse_from_rfc3339(&heartbeat_str)
|
||||
.unwrap_or_else(|_| {
|
||||
chrono::DateTime::parse_from_rfc3339(&updated_str).unwrap_or_default()
|
||||
})
|
||||
.with_timezone(&chrono::Utc),
|
||||
metrics: SessionMetrics {
|
||||
input_tokens: row.get(9)?,
|
||||
output_tokens: row.get(10)?,
|
||||
@@ -1299,7 +1359,10 @@ impl StateStore {
|
||||
)?;
|
||||
|
||||
self.conn.execute(
|
||||
"UPDATE sessions SET updated_at = ?1 WHERE id = ?2",
|
||||
"UPDATE sessions
|
||||
SET updated_at = ?1,
|
||||
last_heartbeat_at = ?1
|
||||
WHERE id = ?2",
|
||||
rusqlite::params![chrono::Utc::now().to_rfc3339(), session_id],
|
||||
)?;
|
||||
|
||||
@@ -1460,6 +1523,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - ChronoDuration::minutes(1),
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
}
|
||||
}
|
||||
@@ -1520,6 +1584,9 @@ mod tests {
|
||||
assert!(column_names.iter().any(|column| column == "pid"));
|
||||
assert!(column_names.iter().any(|column| column == "input_tokens"));
|
||||
assert!(column_names.iter().any(|column| column == "output_tokens"));
|
||||
assert!(column_names
|
||||
.iter()
|
||||
.any(|column| column == "last_heartbeat_at"));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1539,6 +1606,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
@@ -1583,6 +1651,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.insert_session(&Session {
|
||||
@@ -1595,6 +1664,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
@@ -1648,6 +1718,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - ChronoDuration::seconds(95),
|
||||
updated_at: now - ChronoDuration::seconds(1),
|
||||
last_heartbeat_at: now - ChronoDuration::seconds(1),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
db.insert_session(&Session {
|
||||
@@ -1660,6 +1731,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now - ChronoDuration::seconds(80),
|
||||
updated_at: now - ChronoDuration::seconds(5),
|
||||
last_heartbeat_at: now - ChronoDuration::seconds(5),
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
@@ -1678,6 +1750,36 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// `touch_heartbeat` moves a session's `last_heartbeat_at` forward from the
/// value it was inserted with.
#[test]
fn touch_heartbeat_updates_last_heartbeat_timestamp() -> Result<()> {
    let tempdir = TestDir::new("store-touch-heartbeat")?;
    let db_file = tempdir.path().join("state.db");
    let db = StateStore::open(&db_file)?;

    // Start from a heartbeat 30 seconds in the past so the touch is observable.
    let now = Utc::now() - ChronoDuration::seconds(30);
    let session = Session {
        id: "session-1".to_string(),
        task: "heartbeat".to_string(),
        agent_type: "claude".to_string(),
        working_dir: PathBuf::from("/tmp"),
        state: SessionState::Running,
        pid: Some(1234),
        worktree: None,
        created_at: now,
        updated_at: now,
        last_heartbeat_at: now,
        metrics: SessionMetrics::default(),
    };
    db.insert_session(&session)?;

    db.touch_heartbeat("session-1")?;

    let refreshed = db
        .get_session("session-1")?
        .expect("session should still exist");
    assert!(refreshed.last_heartbeat_at > now);

    Ok(())
}
|
||||
|
||||
#[test]
|
||||
fn append_output_line_keeps_latest_buffer_window() -> Result<()> {
|
||||
let tempdir = TestDir::new("store-output")?;
|
||||
@@ -1694,6 +1796,7 @@ mod tests {
|
||||
worktree: None,
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
last_heartbeat_at: now,
|
||||
metrics: SessionMetrics::default(),
|
||||
})?;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user