Auto-retry on transient failures + Run Now button for manual triggers

This commit is contained in:
2026-04-13 14:20:16 +00:00
parent d28143ec00
commit d29555ae53
3 changed files with 85 additions and 9 deletions
+38 -9
View File
@@ -3,9 +3,10 @@
import json
import os
import sys
import time
from datetime import datetime
from zoneinfo import ZoneInfo
from urllib import request
from urllib import request, error as urlerror
MT = ZoneInfo("America/Denver")
@@ -21,26 +22,54 @@ MONTH_NAMES = [
"July", "August", "September", "October", "November", "December",
]
# Retry config
DEFAULT_RETRIES = 3
DEFAULT_BACKOFF = 2  # seconds, doubles each retry
# HTTP status codes treated as transient, i.e. worth retrying.
RETRIABLE_CODES = {408, 429, 500, 502, 503, 504}


def api_request(url, data=None, headers=None, method="GET", retries=DEFAULT_RETRIES, backoff=DEFAULT_BACKOFF):
    """HTTP helper with automatic retry on transient failures.

    Args:
        url: Target URL.
        data: Optional JSON-serializable payload, sent as a UTF-8 JSON body
            with a ``Content-Type: application/json`` header.
        headers: Optional dict of request headers.
        method: HTTP method name.
        retries: Maximum number of retries after the first attempt. Values
            below zero are treated as zero (exactly one attempt).
        backoff: Base delay in seconds; the wait doubles on every retry.

    Returns:
        The response body parsed as JSON.

    Raises:
        urllib.error.HTTPError: The status is not in RETRIABLE_CODES, or the
            retries are exhausted.
        OSError: Connection-level failure (URLError, timeout, reset, ...)
            once the retries are exhausted.
        ValueError: The response body is not valid JSON (never retried).
    """
    body = None if data is None else json.dumps(data).encode("utf-8")
    # A negative count would otherwise skip the loop and fail with an
    # unrelated TypeError instead of performing the request.
    retries = max(0, retries)

    attempt = 0
    while True:
        req = request.Request(url, data=body, headers=headers or {}, method=method)
        if body is not None:
            req.add_header("Content-Type", "application/json")
        try:
            with request.urlopen(req, timeout=30) as resp:
                payload = resp.read()
        except urlerror.HTTPError as e:
            # HTTPError must be handled before OSError: it is a subclass.
            if e.code not in RETRIABLE_CODES or attempt >= retries:
                raise
            # The error object wraps the open response; release it before
            # retrying so the connection is not leaked.
            e.close()
            reason = e.code
        except OSError as e:
            # URLError, TimeoutError and ConnectionError are all OSError
            # subclasses, so this single clause covers them.
            if attempt >= retries:
                raise
            reason = type(e).__name__
        else:
            return json.loads(payload.decode())

        wait = backoff * (2 ** attempt)
        print(f" Retry {attempt + 1}/{retries} after {reason} from {url} (waiting {wait}s)", file=sys.stderr)
        time.sleep(wait)
        attempt += 1
def log_run(agent_id, status, output="", err="", metadata=None, instance_id=None):
"""Log a run to the dashboard API. Uses instance_id if available (v2), falls back to agent_id."""
"""Log a run to the dashboard API."""
try:
if instance_id:
api_request(
f"{DASHBOARD_API}/api/instances/{instance_id}/runs",
data={"status": status, "output": output, "error": err, "metadata": metadata or {}},
method="POST",
retries=1, # Don't retry logging too aggressively
)
else:
print(f"Warning: no instance_id, run not logged for {agent_id}", file=sys.stderr)
+33
View File
@@ -448,6 +448,39 @@ def delete_instance(instance_id: int, user: dict = Depends(require_auth), db: Se
return {"status": "deleted"}
@app.post("/api/instances/{instance_id}/trigger")
def trigger_instance(instance_id: int, user: dict = Depends(require_auth), db: Session = Depends(get_db)):
    """Trigger a manual run of an agent instance in a background subprocess.

    The instance must belong to the authenticated user; otherwise a 404 is
    raised. Only the "daily-briefing" catalog type can be triggered; any
    other type returns a ``{"status": "error", ...}`` payload. The child
    process is started and not waited on, so the response only confirms that
    the run was launched, not that it succeeded.
    """
    import json
    import subprocess

    inst = db.query(AgentInstance).filter(
        AgentInstance.id == instance_id, AgentInstance.user_id == user["user_id"]
    ).first()
    if not inst:
        raise HTTPException(status_code=404)

    catalog_id = inst.catalog_id
    if catalog_id != "daily-briefing":
        return {"status": "error", "message": f"Manual trigger not supported for {catalog_id} yet"}

    u = db.query(User).filter(User.id == user["user_id"]).first()
    if not u:
        raise HTTPException(status_code=404)

    # The agent scripts receive their instance id through a per-user
    # environment variable, e.g. ERIC_INSTANCE_ID.
    env_key = f"{u.username.upper().replace('.','_')}_INSTANCE_ID"
    env = {**dict(os.environ), env_key: str(instance_id)}

    # Users with a dedicated wrapper script; everyone else uses the generic engine.
    script_map = {
        "eric": "eric_briefing.py",
        "angela": "angela_briefing.py",
    }
    script = script_map.get(u.username)
    if script:
        cmd = ["python3", script]
    else:
        # Pass the run config as a JSON argument rather than interpolating
        # user-controlled text (display_name) into shell or Python source,
        # which allowed command/code injection.
        run_config = {
            "person": str(u.display_name),
            "agent_id": catalog_id,
            "instance_id": instance_id,
            "wiki_parent_doc_id": "",
            "location": {},
        }
        cmd = [
            "python3",
            "-c",
            "import json, sys; from daily_briefing import run; run(json.loads(sys.argv[1]))",
            json.dumps(run_config),
        ]

    # Argument list + cwd replaces the old "cd ... && ..." shell string (no shell involved).
    subprocess.Popen(cmd, cwd="/opt/agent-dashboard/agents", env=env)
    return {"status": "triggered", "message": f"Running {catalog_id} for {u.display_name}"}
# --- Internal endpoints (no auth, for agent scripts) ---
@app.get("/api/instances/{instance_id}/config")
+14
View File
@@ -206,6 +206,7 @@ function buildConfigForm(inst){
html+=`<div class="config-actions">
<button class="btn-save" onclick="saveInstanceConfig(${inst.id})">Save</button>
<button class="btn-secondary" onclick="triggerRun(${inst.id})">Run Now</button>
<button class="btn-danger" onclick="deleteInstance(${inst.id})">Delete</button>
<span class="save-msg" id="save-msg"></span>
</div></div>`;
@@ -286,6 +287,19 @@ async function saveInstanceConfig(id){
else{msg.textContent='Error';msg.style.color='var(--red)'}
}
// Ask the backend to start a manual run of instance `id` and report the
// outcome in the #save-msg element. On success the message is cleared and
// the view refreshed after 5 seconds.
async function triggerRun(id){
  const msg=document.getElementById('save-msg');
  // Tolerate a missing status element instead of throwing.
  const show=(text,color)=>{if(msg){msg.textContent=text;msg.style.color=color}};
  show('Running...','var(--blue)');
  try{
    const res=await fetch(API+'/api/instances/'+id+'/trigger',{method:'POST'});
    if(!res.ok){show('Failed to trigger','var(--red)');return}
    const data=await res.json();
    // The endpoint answers HTTP 200 with status "error" for unsupported
    // agent types; do not present that as a success.
    if(data.status==='error'){show(data.message||'Failed to trigger','var(--red)');return}
    show(data.message||'Triggered','var(--green)');
    setTimeout(()=>{if(msg)msg.textContent='';refresh()},5000);
  }catch(err){
    // Network failure or invalid JSON: never leave the UI stuck on "Running...".
    show('Failed to trigger','var(--red)');
  }
}
async function deleteInstance(id){
if(!confirm('Delete this agent instance and all its runs?'))return;
await fetch(API+'/api/instances/'+id,{method:'DELETE'});