Auto-retry on transient failures + Run Now button for manual triggers
This commit is contained in:
+38
-9
@@ -3,9 +3,10 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from zoneinfo import ZoneInfo
|
from zoneinfo import ZoneInfo
|
||||||
from urllib import request
|
from urllib import request, error as urlerror
|
||||||
|
|
||||||
MT = ZoneInfo("America/Denver")
|
MT = ZoneInfo("America/Denver")
|
||||||
|
|
||||||
@@ -21,26 +22,54 @@ MONTH_NAMES = [
|
|||||||
"July", "August", "September", "October", "November", "December",
|
"July", "August", "September", "October", "November", "December",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Retry config
|
||||||
|
DEFAULT_RETRIES = 3
|
||||||
|
DEFAULT_BACKOFF = 2 # seconds, doubles each retry
|
||||||
|
RETRIABLE_CODES = {408, 429, 500, 502, 503, 504}
|
||||||
|
|
||||||
def api_request(url, data=None, headers=None, method="GET"):
|
|
||||||
"""Simple HTTP helper using urllib."""
|
def api_request(url, data=None, headers=None, method="GET", retries=DEFAULT_RETRIES, backoff=DEFAULT_BACKOFF):
|
||||||
|
"""HTTP helper with automatic retry on transient failures."""
|
||||||
if data is not None:
|
if data is not None:
|
||||||
data = json.dumps(data).encode("utf-8")
|
data = json.dumps(data).encode("utf-8")
|
||||||
req = request.Request(url, data=data, headers=headers or {}, method=method)
|
|
||||||
if data:
|
last_error = None
|
||||||
req.add_header("Content-Type", "application/json")
|
for attempt in range(retries + 1):
|
||||||
with request.urlopen(req, timeout=30) as resp:
|
try:
|
||||||
return json.loads(resp.read().decode())
|
req = request.Request(url, data=data, headers=headers or {}, method=method)
|
||||||
|
if data:
|
||||||
|
req.add_header("Content-Type", "application/json")
|
||||||
|
with request.urlopen(req, timeout=30) as resp:
|
||||||
|
return json.loads(resp.read().decode())
|
||||||
|
except urlerror.HTTPError as e:
|
||||||
|
last_error = e
|
||||||
|
if e.code in RETRIABLE_CODES and attempt < retries:
|
||||||
|
wait = backoff * (2 ** attempt)
|
||||||
|
print(f" Retry {attempt + 1}/{retries} after {e.code} from {url} (waiting {wait}s)", file=sys.stderr)
|
||||||
|
time.sleep(wait)
|
||||||
|
continue
|
||||||
|
raise
|
||||||
|
except (urlerror.URLError, TimeoutError, ConnectionError, OSError) as e:
|
||||||
|
last_error = e
|
||||||
|
if attempt < retries:
|
||||||
|
wait = backoff * (2 ** attempt)
|
||||||
|
print(f" Retry {attempt + 1}/{retries} after {type(e).__name__} from {url} (waiting {wait}s)", file=sys.stderr)
|
||||||
|
time.sleep(wait)
|
||||||
|
continue
|
||||||
|
raise
|
||||||
|
|
||||||
|
raise last_error
|
||||||
|
|
||||||
|
|
||||||
def log_run(agent_id, status, output="", err="", metadata=None, instance_id=None):
|
def log_run(agent_id, status, output="", err="", metadata=None, instance_id=None):
|
||||||
"""Log a run to the dashboard API. Uses instance_id if available (v2), falls back to agent_id."""
|
"""Log a run to the dashboard API."""
|
||||||
try:
|
try:
|
||||||
if instance_id:
|
if instance_id:
|
||||||
api_request(
|
api_request(
|
||||||
f"{DASHBOARD_API}/api/instances/{instance_id}/runs",
|
f"{DASHBOARD_API}/api/instances/{instance_id}/runs",
|
||||||
data={"status": status, "output": output, "error": err, "metadata": metadata or {}},
|
data={"status": status, "output": output, "error": err, "metadata": metadata or {}},
|
||||||
method="POST",
|
method="POST",
|
||||||
|
retries=1, # Don't retry logging too aggressively
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
print(f"Warning: no instance_id, run not logged for {agent_id}", file=sys.stderr)
|
print(f"Warning: no instance_id, run not logged for {agent_id}", file=sys.stderr)
|
||||||
|
|||||||
@@ -448,6 +448,39 @@ def delete_instance(instance_id: int, user: dict = Depends(require_auth), db: Se
|
|||||||
return {"status": "deleted"}
|
return {"status": "deleted"}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/instances/{instance_id}/trigger")
|
||||||
|
def trigger_instance(instance_id: int, user: dict = Depends(require_auth), db: Session = Depends(get_db)):
|
||||||
|
"""Trigger a manual run of an agent instance. Runs async via subprocess."""
|
||||||
|
inst = db.query(AgentInstance).filter(
|
||||||
|
AgentInstance.id == instance_id, AgentInstance.user_id == user["user_id"]
|
||||||
|
).first()
|
||||||
|
if not inst:
|
||||||
|
raise HTTPException(status_code=404)
|
||||||
|
|
||||||
|
# Determine which script to run based on catalog type and user
|
||||||
|
catalog_id = inst.catalog_id
|
||||||
|
u = db.query(User).filter(User.id == user["user_id"]).first()
|
||||||
|
|
||||||
|
if catalog_id == "daily-briefing":
|
||||||
|
# Find the user's briefing wrapper script or use the generic engine
|
||||||
|
import subprocess
|
||||||
|
env = {**dict(os.environ), f"{u.username.upper().replace('.','_')}_INSTANCE_ID": str(instance_id)}
|
||||||
|
# Check for user-specific script
|
||||||
|
script_map = {
|
||||||
|
"eric": "eric_briefing.py",
|
||||||
|
"angela": "angela_briefing.py",
|
||||||
|
}
|
||||||
|
script = script_map.get(u.username, None)
|
||||||
|
if script:
|
||||||
|
cmd = f"cd /opt/agent-dashboard/agents && {u.username.upper().replace('.','_')}_INSTANCE_ID={instance_id} python3 {script}"
|
||||||
|
else:
|
||||||
|
cmd = f"cd /opt/agent-dashboard/agents && python3 -c \"from daily_briefing import run; run({{'person': '{u.display_name}', 'agent_id': '{catalog_id}', 'instance_id': {instance_id}, 'wiki_parent_doc_id': '', 'location': {{}}}})\""
|
||||||
|
subprocess.Popen(cmd, shell=True, env=env)
|
||||||
|
return {"status": "triggered", "message": f"Running {catalog_id} for {u.display_name}"}
|
||||||
|
|
||||||
|
return {"status": "error", "message": f"Manual trigger not supported for {catalog_id} yet"}
|
||||||
|
|
||||||
|
|
||||||
# --- Internal endpoints (no auth, for agent scripts) ---
|
# --- Internal endpoints (no auth, for agent scripts) ---
|
||||||
|
|
||||||
@app.get("/api/instances/{instance_id}/config")
|
@app.get("/api/instances/{instance_id}/config")
|
||||||
|
|||||||
@@ -206,6 +206,7 @@ function buildConfigForm(inst){
|
|||||||
|
|
||||||
html+=`<div class="config-actions">
|
html+=`<div class="config-actions">
|
||||||
<button class="btn-save" onclick="saveInstanceConfig(${inst.id})">Save</button>
|
<button class="btn-save" onclick="saveInstanceConfig(${inst.id})">Save</button>
|
||||||
|
<button class="btn-secondary" onclick="triggerRun(${inst.id})">Run Now</button>
|
||||||
<button class="btn-danger" onclick="deleteInstance(${inst.id})">Delete</button>
|
<button class="btn-danger" onclick="deleteInstance(${inst.id})">Delete</button>
|
||||||
<span class="save-msg" id="save-msg"></span>
|
<span class="save-msg" id="save-msg"></span>
|
||||||
</div></div>`;
|
</div></div>`;
|
||||||
@@ -286,6 +287,19 @@ async function saveInstanceConfig(id){
|
|||||||
else{msg.textContent='Error';msg.style.color='var(--red)'}
|
else{msg.textContent='Error';msg.style.color='var(--red)'}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function triggerRun(id){
|
||||||
|
const msg=document.getElementById('save-msg');
|
||||||
|
msg.textContent='Running...';msg.style.color='var(--blue)';
|
||||||
|
const res=await fetch(API+'/api/instances/'+id+'/trigger',{method:'POST'});
|
||||||
|
if(res.ok){
|
||||||
|
const data=await res.json();
|
||||||
|
msg.textContent=data.message||'Triggered';msg.style.color='var(--green)';
|
||||||
|
setTimeout(()=>{msg.textContent='';refresh()},5000);
|
||||||
|
} else {
|
||||||
|
msg.textContent='Failed to trigger';msg.style.color='var(--red)';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async function deleteInstance(id){
|
async function deleteInstance(id){
|
||||||
if(!confirm('Delete this agent instance and all its runs?'))return;
|
if(!confirm('Delete this agent instance and all its runs?'))return;
|
||||||
await fetch(API+'/api/instances/'+id,{method:'DELETE'});
|
await fetch(API+'/api/instances/'+id,{method:'DELETE'});
|
||||||
|
|||||||
Reference in New Issue
Block a user