Auto-retry on transient failures + Run Now button for manual triggers

This commit is contained in:
2026-04-13 14:20:16 +00:00
parent d28143ec00
commit d29555ae53
3 changed files with 85 additions and 9 deletions
+33 -4
View File
@@ -3,9 +3,10 @@
import json import json
import os import os
import sys import sys
import time
from datetime import datetime from datetime import datetime
from zoneinfo import ZoneInfo from zoneinfo import ZoneInfo
from urllib import request from urllib import request, error as urlerror
MT = ZoneInfo("America/Denver") MT = ZoneInfo("America/Denver")
@@ -21,26 +22,54 @@ MONTH_NAMES = [
"July", "August", "September", "October", "November", "December", "July", "August", "September", "October", "November", "December",
] ]
# Retry config
DEFAULT_RETRIES = 3
DEFAULT_BACKOFF = 2 # seconds, doubles each retry
RETRIABLE_CODES = {408, 429, 500, 502, 503, 504}
def api_request(url, data=None, headers=None, method="GET"):
"""Simple HTTP helper using urllib.""" def api_request(url, data=None, headers=None, method="GET", retries=DEFAULT_RETRIES, backoff=DEFAULT_BACKOFF):
"""HTTP helper with automatic retry on transient failures."""
if data is not None: if data is not None:
data = json.dumps(data).encode("utf-8") data = json.dumps(data).encode("utf-8")
last_error = None
for attempt in range(retries + 1):
try:
req = request.Request(url, data=data, headers=headers or {}, method=method) req = request.Request(url, data=data, headers=headers or {}, method=method)
if data: if data:
req.add_header("Content-Type", "application/json") req.add_header("Content-Type", "application/json")
with request.urlopen(req, timeout=30) as resp: with request.urlopen(req, timeout=30) as resp:
return json.loads(resp.read().decode()) return json.loads(resp.read().decode())
except urlerror.HTTPError as e:
last_error = e
if e.code in RETRIABLE_CODES and attempt < retries:
wait = backoff * (2 ** attempt)
print(f" Retry {attempt + 1}/{retries} after {e.code} from {url} (waiting {wait}s)", file=sys.stderr)
time.sleep(wait)
continue
raise
except (urlerror.URLError, TimeoutError, ConnectionError, OSError) as e:
last_error = e
if attempt < retries:
wait = backoff * (2 ** attempt)
print(f" Retry {attempt + 1}/{retries} after {type(e).__name__} from {url} (waiting {wait}s)", file=sys.stderr)
time.sleep(wait)
continue
raise
raise last_error
def log_run(agent_id, status, output="", err="", metadata=None, instance_id=None): def log_run(agent_id, status, output="", err="", metadata=None, instance_id=None):
"""Log a run to the dashboard API. Uses instance_id if available (v2), falls back to agent_id.""" """Log a run to the dashboard API."""
try: try:
if instance_id: if instance_id:
api_request( api_request(
f"{DASHBOARD_API}/api/instances/{instance_id}/runs", f"{DASHBOARD_API}/api/instances/{instance_id}/runs",
data={"status": status, "output": output, "error": err, "metadata": metadata or {}}, data={"status": status, "output": output, "error": err, "metadata": metadata or {}},
method="POST", method="POST",
retries=1, # Don't retry logging too aggressively
) )
else: else:
print(f"Warning: no instance_id, run not logged for {agent_id}", file=sys.stderr) print(f"Warning: no instance_id, run not logged for {agent_id}", file=sys.stderr)
+33
View File
@@ -448,6 +448,39 @@ def delete_instance(instance_id: int, user: dict = Depends(require_auth), db: Se
return {"status": "deleted"} return {"status": "deleted"}
@app.post("/api/instances/{instance_id}/trigger")
def trigger_instance(instance_id: int, user: dict = Depends(require_auth), db: Session = Depends(get_db)):
"""Trigger a manual run of an agent instance. Runs async via subprocess."""
inst = db.query(AgentInstance).filter(
AgentInstance.id == instance_id, AgentInstance.user_id == user["user_id"]
).first()
if not inst:
raise HTTPException(status_code=404)
# Determine which script to run based on catalog type and user
catalog_id = inst.catalog_id
u = db.query(User).filter(User.id == user["user_id"]).first()
if catalog_id == "daily-briefing":
# Find the user's briefing wrapper script or use the generic engine
import subprocess
env = {**dict(os.environ), f"{u.username.upper().replace('.','_')}_INSTANCE_ID": str(instance_id)}
# Check for user-specific script
script_map = {
"eric": "eric_briefing.py",
"angela": "angela_briefing.py",
}
script = script_map.get(u.username, None)
if script:
cmd = f"cd /opt/agent-dashboard/agents && {u.username.upper().replace('.','_')}_INSTANCE_ID={instance_id} python3 {script}"
else:
cmd = f"cd /opt/agent-dashboard/agents && python3 -c \"from daily_briefing import run; run({{'person': '{u.display_name}', 'agent_id': '{catalog_id}', 'instance_id': {instance_id}, 'wiki_parent_doc_id': '', 'location': {{}}}})\""
subprocess.Popen(cmd, shell=True, env=env)
return {"status": "triggered", "message": f"Running {catalog_id} for {u.display_name}"}
return {"status": "error", "message": f"Manual trigger not supported for {catalog_id} yet"}
# --- Internal endpoints (no auth, for agent scripts) --- # --- Internal endpoints (no auth, for agent scripts) ---
@app.get("/api/instances/{instance_id}/config") @app.get("/api/instances/{instance_id}/config")
+14
View File
@@ -206,6 +206,7 @@ function buildConfigForm(inst){
html+=`<div class="config-actions"> html+=`<div class="config-actions">
<button class="btn-save" onclick="saveInstanceConfig(${inst.id})">Save</button> <button class="btn-save" onclick="saveInstanceConfig(${inst.id})">Save</button>
<button class="btn-secondary" onclick="triggerRun(${inst.id})">Run Now</button>
<button class="btn-danger" onclick="deleteInstance(${inst.id})">Delete</button> <button class="btn-danger" onclick="deleteInstance(${inst.id})">Delete</button>
<span class="save-msg" id="save-msg"></span> <span class="save-msg" id="save-msg"></span>
</div></div>`; </div></div>`;
@@ -286,6 +287,19 @@ async function saveInstanceConfig(id){
else{msg.textContent='Error';msg.style.color='var(--red)'} else{msg.textContent='Error';msg.style.color='var(--red)'}
} }
async function triggerRun(id){
const msg=document.getElementById('save-msg');
msg.textContent='Running...';msg.style.color='var(--blue)';
const res=await fetch(API+'/api/instances/'+id+'/trigger',{method:'POST'});
if(res.ok){
const data=await res.json();
msg.textContent=data.message||'Triggered';msg.style.color='var(--green)';
setTimeout(()=>{msg.textContent='';refresh()},5000);
} else {
msg.textContent='Failed to trigger';msg.style.color='var(--red)';
}
}
async function deleteInstance(id){ async function deleteInstance(id){
if(!confirm('Delete this agent instance and all its runs?'))return; if(!confirm('Delete this agent instance and all its runs?'))return;
await fetch(API+'/api/instances/'+id,{method:'DELETE'}); await fetch(API+'/api/instances/'+id,{method:'DELETE'});