Monitoring & Alerting
Set up comprehensive monitoring and alerting for your Donobu automation flows with webhooks, Slack, Datadog, and other monitoring platforms.
Monitoring & Alerting Overview
Implement robust monitoring and alerting systems to track your Donobu automation flows, detect issues early, and maintain high reliability.
Webhook Integration
Basic Webhook Handler
Configure webhooks to receive real-time notifications when flows complete:
// webhook-handler.js
const express = require('express');

// Donobu API endpoint used to look up the flow named in each webhook call.
const DONOBU_API_URL = 'http://localhost:31000';

const app = express();
app.use(express.json());

/**
 * Webhook endpoint. Reads the flow id from the JSON body, fetches the full
 * flow record from the Donobu API and passes it to processFlowCompletion.
 *
 * Responds 400 when the body carries no id, 500 when the lookup or the
 * processing fails, and 200 otherwise.
 */
app.post('/donobu-webhook', async (req, res) => {
  const { id } = req.body ?? {};

  // Without an id there is nothing to look up; reject instead of requesting
  // `/api/flows/undefined`.
  if (id === undefined || id === null || id === '') {
    return res.status(400).send('Missing flow id');
  }

  try {
    // Fetch complete flow details. The id comes from the request body, so it
    // is encoded before being placed in the URL path.
    const response = await fetch(
      `${DONOBU_API_URL}/api/flows/${encodeURIComponent(id)}`
    );
    if (!response.ok) {
      // A 404/500 body is not a flow; do not feed it to the processors.
      throw new Error(`Flow lookup failed with HTTP ${response.status}`);
    }
    const flow = await response.json();

    // Process based on flow state
    await processFlowCompletion(flow);

    res.status(200).send('OK');
  } catch (error) {
    console.error('Webhook processing error:', error);
    res.status(500).send('Error processing webhook');
  }
});

/**
 * Routes a finished flow to the matching handler.
 *
 * sendCriticalAlert, logSuccessMetrics and logFlowMetrics are not defined in
 * this snippet and must be supplied by the surrounding application.
 *
 * @param {object} flow - Flow record returned by the Donobu API; only
 *   `flow.state` is inspected here.
 * @returns {Promise<void>}
 */
async function processFlowCompletion(flow) {
  if (flow.state === 'FAILED') {
    await sendCriticalAlert(flow);
  } else if (flow.state === 'SUCCESS') {
    await logSuccessMetrics(flow);
  }

  // Always log execution metrics
  await logFlowMetrics(flow);
}

app.listen(4000, () => {
  console.log('Webhook server running on port 4000');
});
Advanced Webhook Processing
// advanced-webhook-handler.js const express = require('express'); const crypto = require('crypto'); const app = express(); // Webhook signature verification function verifyWebhookSignature(payload, signature, secret) { const expectedSignature = crypto .createHmac('sha256', secret) .update(payload) .digest('hex'); return crypto.timingSafeEqual( Buffer.from(signature), Buffer.from(expectedSignature) ); } app.use('/donobu-webhook', express.raw({type: 'application/json'}), (req, res) => { const signature = req.headers['x-donobu-signature']; const secret = process.env.WEBHOOK_SECRET; if (!verifyWebhookSignature(req.body, signature, secret)) { return res.status(401).send('Unauthorized'); } const payload = JSON.parse(req.body); processWebhookPayload(payload); res.status(200).send('OK'); }); async function processWebhookPayload(payload) { const { id, event } = payload; switch (event) { case 'flow.completed': await handleFlowCompletion(id); break; case 'flow.failed': await handleFlowFailure(id); break; case 'flow.started': await handleFlowStart(id); break; default: console.log('Unknown webhook event:', event); } }
Slack Integration
Slack Notification Service
// slack-notifier.js const { WebClient } = require('@slack/web-api'); class SlackNotifier { constructor(token) { this.slack = new WebClient(token); } async sendFlowNotification(flow) { const emoji = this.getStatusEmoji(flow.state); const color = this.getStatusColor(flow.state); const message = { channel: '#qa-automation', text: `${emoji} Flow ${flow.state}: ${flow.name}`, attachments: [{ color: color, fields: [ { title: 'Flow ID', value: flow.id, short: true }, { title: 'Duration', value: this.formatDuration(flow.completedAt - flow.startedAt), short: true }, { title: 'Token Usage', value: `${flow.inputTokensUsed + flow.completionTokensUsed}`, short: true }, { title: 'Website', value: flow.targetWebsite, short: true } ], actions: flow.state === 'FAILED' ? [ { type: 'button', text: 'View Flow Details', url: `http://localhost:3000/flows/${flow.id}` }, { type: 'button', text: 'Download Video', url: `http://localhost:31000/api/flows/${flow.id}/video` } ] : [], footer: 'Donobu Automation', ts: Math.floor(flow.completedAt / 1000) }] }; await this.slack.chat.postMessage(message); } getStatusEmoji(state) { const emojis = { SUCCESS: '✅', FAILED: '❌', RUNNING: '🔄', PAUSED: '⏸️' }; return emojis[state] || '❓'; } getStatusColor(state) { const colors = { SUCCESS: 'good', FAILED: 'danger', RUNNING: '#439FE0', PAUSED: 'warning' }; return colors[state] || '#cccccc'; } formatDuration(milliseconds) { const seconds = Math.round(milliseconds / 1000); if (seconds < 60) return `${seconds}s`; const minutes = Math.floor(seconds / 60); const remainingSeconds = seconds % 60; return `${minutes}m ${remainingSeconds}s`; } async sendDailyReport(flows) { const successful = flows.filter(f => f.state === 'SUCCESS').length; const failed = flows.filter(f => f.state === 'FAILED').length; const total = flows.length; const message = { channel: '#qa-automation', text: '📊 Daily Automation Report', attachments: [{ color: failed > 0 ? 
'warning' : 'good', fields: [ { title: 'Total Flows', value: total.toString(), short: true }, { title: 'Successful', value: `✅ ${successful}`, short: true }, { title: 'Failed', value: `❌ ${failed}`, short: true }, { title: 'Success Rate', value: `${Math.round((successful / total) * 100)}%`, short: true } ] }] }; await this.slack.chat.postMessage(message); } } module.exports = SlackNotifier;
Datadog Integration
Datadog Metrics Collection
// datadog-metrics.js const StatsD = require('hot-shots'); class DatadogMetrics { constructor() { this.dogstatsd = new StatsD({ host: 'localhost', port: 8125, prefix: 'donobu.', globalTags: { environment: process.env.NODE_ENV || 'development', service: 'donobu-automation' } }); } trackFlowCompletion(flow) { // Track flow completion with status this.dogstatsd.increment('flow.completed', 1, { flow_name: flow.name, state: flow.state, run_mode: flow.runMode, website: this.sanitizeTag(flow.targetWebsite) }); } trackFlowDuration(flow) { const duration = flow.completedAt - flow.startedAt; this.dogstatsd.histogram('flow.duration', duration, { flow_name: flow.name, run_mode: flow.runMode, state: flow.state }); } trackTokenUsage(flow) { this.dogstatsd.gauge('flow.tokens.input', flow.inputTokensUsed, { flow_name: flow.name }); this.dogstatsd.gauge('flow.tokens.completion', flow.completionTokensUsed, { flow_name: flow.name }); this.dogstatsd.gauge('flow.tokens.total', flow.inputTokensUsed + flow.completionTokensUsed, { flow_name: flow.name }); } trackToolCalls(flow) { // Get tool calls count from API fetch(`http://localhost:31000/api/flows/${flow.id}/tool-calls`) .then(response => response.json()) .then(toolCalls => { this.dogstatsd.gauge('flow.tool_calls', toolCalls.length, { flow_name: flow.name }); // Track tool usage breakdown const toolUsage = {}; toolCalls.forEach(call => { toolUsage[call.toolName] = (toolUsage[call.toolName] || 0) + 1; }); Object.entries(toolUsage).forEach(([toolName, count]) => { this.dogstatsd.gauge('flow.tool_usage', count, { flow_name: flow.name, tool_name: toolName }); }); }) .catch(error => console.error('Error fetching tool calls:', error)); } trackErrorRate() { // This would typically be called periodically fetch('http://localhost:31000/api/flows?limit=100') .then(response => response.json()) .then(data => { const flows = data.items || []; const failed = flows.filter(f => f.state === 'FAILED').length; const total = flows.length; if (total > 0) { 
const errorRate = (failed / total) * 100; this.dogstatsd.gauge('flow.error_rate', errorRate); } }) .catch(error => console.error('Error calculating error rate:', error)); } sanitizeTag(value) { // Remove or replace invalid characters for Datadog tags return value.replace(/[^a-zA-Z0-9_.-]/g, '_').toLowerCase(); } // Custom business metrics trackBusinessMetrics(flow) { // Example: Track conversion funnel steps if (flow.name.includes('Checkout')) { this.dogstatsd.increment('business.checkout_attempts'); if (flow.state === 'SUCCESS') { this.dogstatsd.increment('business.checkout_completed'); } } // Example: Track user onboarding if (flow.name.includes('Onboarding')) { this.dogstatsd.increment('business.onboarding_attempts'); if (flow.state === 'SUCCESS') { this.dogstatsd.increment('business.onboarding_completed'); } } } } module.exports = DatadogMetrics;
Datadog Dashboard Configuration
{
  "title": "Donobu Automation Dashboard",
  "widgets": [
    {
      "id": 1,
      "definition": {
        "type": "timeseries",
        "title": "Flow Completion Rate",
        "requests": [
          {
            "q": "sum:donobu.flow.completed{state:success}.as_rate()",
            "display_type": "line",
            "style": { "palette": "green" }
          },
          {
            "q": "sum:donobu.flow.completed{state:failed}.as_rate()",
            "display_type": "line",
            "style": { "palette": "red" }
          }
        ]
      }
    },
    {
      "id": 2,
      "definition": {
        "type": "query_value",
        "title": "Current Error Rate",
        "requests": [
          {
            "q": "avg:donobu.flow.error_rate{*}",
            "aggregator": "last"
          }
        ],
        "precision": 2
      }
    }
  ]
}
Prometheus & Grafana Integration
Prometheus Metrics Exporter
// prometheus-metrics.js const client = require('prom-client'); const express = require('express'); // Create metrics const flowCompletionCounter = new client.Counter({ name: 'donobu_flows_completed_total', help: 'Total number of completed flows', labelNames: ['status', 'flow_name', 'run_mode'] }); const flowDurationHistogram = new client.Histogram({ name: 'donobu_flow_duration_seconds', help: 'Duration of flow execution in seconds', labelNames: ['flow_name', 'status'], buckets: [0.5, 1, 2, 5, 10, 30, 60, 120, 300] }); const tokenUsageGauge = new client.Gauge({ name: 'donobu_tokens_used_total', help: 'Total tokens used by flows', labelNames: ['flow_name', 'token_type'] }); const activeFlowsGauge = new client.Gauge({ name: 'donobu_active_flows', help: 'Number of currently running flows' }); class PrometheusMetrics { constructor() { // Register default metrics client.register.setDefaultLabels({ app: 'donobu-automation', environment: process.env.NODE_ENV || 'development' }); client.collectDefaultMetrics(); } recordFlowCompletion(flow) { flowCompletionCounter.inc({ status: flow.state.toLowerCase(), flow_name: flow.name, run_mode: flow.runMode }); const durationSeconds = (flow.completedAt - flow.startedAt) / 1000; flowDurationHistogram.observe({ flow_name: flow.name, status: flow.state.toLowerCase() }, durationSeconds); tokenUsageGauge.set({ flow_name: flow.name, token_type: 'input' }, flow.inputTokensUsed); tokenUsageGauge.set({ flow_name: flow.name, token_type: 'completion' }, flow.completionTokensUsed); } updateActiveFlows(count) { activeFlowsGauge.set(count); } startMetricsServer(port = 9090) { const app = express(); app.get('/metrics', async (req, res) => { res.set('Content-Type', client.register.contentType); res.end(await client.register.metrics()); }); app.listen(port, () => { console.log(`Metrics server listening on port ${port}`); }); } } module.exports = PrometheusMetrics;
Grafana Dashboard JSON
{
  "dashboard": {
    "id": null,
    "title": "Donobu Automation Metrics",
    "tags": ["donobu", "automation"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Flow Success Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(donobu_flows_completed_total{status=\"success\"}[5m])) / sum(rate(donobu_flows_completed_total[5m])) * 100",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100
          }
        }
      },
      {
        "id": 2,
        "title": "Flow Duration Distribution",
        "type": "heatmap",
        "targets": [
          {
            "expr": "increase(donobu_flow_duration_seconds_bucket[5m])",
            "refId": "A",
            "format": "heatmap"
          }
        ]
      },
      {
        "id": 3,
        "title": "Active Flows",
        "type": "graph",
        "targets": [
          {
            "expr": "donobu_active_flows",
            "refId": "A"
          }
        ]
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "30s"
  }
}
PagerDuty Integration
PagerDuty Alert Service
// pagerduty-alerts.js const pdClient = require('node-pagerduty'); class PagerDutyAlerts { constructor(integrationKey) { this.pd = new pdClient({ integrationKey: integrationKey }); } async sendCriticalAlert(flow) { const payload = { routing_key: this.integrationKey, event_action: 'trigger', dedup_key: `donobu-flow-${flow.id}`, payload: { summary: `Critical Flow Failure: ${flow.name}`, source: 'donobu-automation', severity: 'critical', component: 'automation-flow', group: 'e2e-testing', class: 'flow-execution', custom_details: { flow_id: flow.id, flow_name: flow.name, target_website: flow.targetWebsite, error: flow.result?.error, duration: flow.completedAt - flow.startedAt, token_usage: flow.inputTokensUsed + flow.completionTokensUsed } }, links: [ { href: `http://localhost:3000/flows/${flow.id}`, text: 'View Flow Details' } ] }; try { const response = await this.pd.events.sendEvent(payload); console.log('PagerDuty alert sent:', response.dedup_key); } catch (error) { console.error('Failed to send PagerDuty alert:', error); } } async resolveAlert(flowId) { const payload = { routing_key: this.integrationKey, event_action: 'resolve', dedup_key: `donobu-flow-${flowId}` }; try { await this.pd.events.sendEvent(payload); console.log('PagerDuty alert resolved for flow:', flowId); } catch (error) { console.error('Failed to resolve PagerDuty alert:', error); } } } module.exports = PagerDutyAlerts;
Custom Monitoring Dashboard
Health Check Service
// health-monitor.js

/**
 * Aggregates three health checks (Donobu API reachability, recent flow
 * failures, local process resources) into one status report.
 */
class HealthMonitor {
  /**
   * @param {object} [options]
   * @param {string} [options.apiBaseUrl='http://localhost:31000'] - Donobu API
   *   base URL queried by the API and flow checks.
   */
  constructor({ apiBaseUrl = 'http://localhost:31000' } = {}) {
    this.apiBaseUrl = apiBaseUrl;
    this.healthChecks = new Map();
    this.alertThresholds = {
      errorRate: 10, // Alert if error rate > 10%
      avgDuration: 300000, // Alert if avg duration > 5 minutes
      failureStreak: 3, // Alert after 3 consecutive failures
    };
  }

  /**
   * Runs every check in turn and derives the overall status.
   *
   * @returns {Promise<{timestamp: number, status: string, checks: object, error?: string}>}
   *   `status` is 'healthy' only when every check is healthy. `error` is set
   *   when a check throws.
   */
  async performHealthCheck() {
    const health = {
      timestamp: Date.now(),
      status: 'healthy',
      checks: {},
    };

    try {
      // Check API availability
      health.checks.api = await this.checkApiHealth();

      // Check recent flow performance
      health.checks.flows = await this.checkFlowHealth();

      // Check system resources
      health.checks.resources = await this.checkSystemHealth();

      // Determine overall status
      health.status = this.calculateOverallStatus(health.checks);
    } catch (error) {
      health.status = 'unhealthy';
      health.error = error.message;
    }

    return health;
  }

  /**
   * Pings the API and measures how long the request takes.
   *
   * @returns {Promise<{status: string, responseTime?: number, error?: string}>}
   *   `responseTime` is in milliseconds. `error` is set when the request
   *   fails.
   */
  async checkApiHealth() {
    // Captured before the request so the difference below is the round trip.
    const startTime = Date.now();

    try {
      const response = await fetch(`${this.apiBaseUrl}/api/ping`);
      return {
        status: response.ok ? 'healthy' : 'unhealthy',
        responseTime: Date.now() - startTime,
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message,
      };
    }
  }

  /**
   * Computes the failure percentage among flows started in the last hour,
   * taken from the 50 flows returned by the API, and compares it against
   * `alertThresholds.errorRate`.
   *
   * @returns {Promise<{status: string, recentFlows?: number, failedFlows?: number, errorRate?: number, error?: string}>}
   *   `errorRate` is a percentage rounded to two decimals.
   */
  async checkFlowHealth() {
    try {
      const response = await fetch(`${this.apiBaseUrl}/api/flows?limit=50`);
      if (!response.ok) {
        // An error body has no `items`; without this check the result would
        // be "healthy, 0 flows".
        throw new Error(`Flow listing failed with HTTP ${response.status}`);
      }

      const data = await response.json();
      const flows = data.items || [];

      const oneHourAgo = Date.now() - 3600000; // Last hour
      const recent = flows.filter((f) => f.startedAt > oneHourAgo);
      const failed = recent.filter((f) => f.state === 'FAILED');
      const errorRate =
        recent.length > 0 ? (failed.length / recent.length) * 100 : 0;

      return {
        status:
          errorRate > this.alertThresholds.errorRate ? 'unhealthy' : 'healthy',
        recentFlows: recent.length,
        failedFlows: failed.length,
        errorRate: Math.round(errorRate * 100) / 100,
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message,
      };
    }
  }

  /**
   * Reports heap usage (in MiB) and uptime (in seconds) of the current
   * process. The status is always 'healthy'; no thresholds are applied.
   *
   * @returns {Promise<{status: string, memory: {used: number, total: number}, uptime: number}>}
   */
  async checkSystemHealth() {
    const used = process.memoryUsage();

    return {
      status: 'healthy',
      memory: {
        used: Math.round(used.heapUsed / 1024 / 1024),
        total: Math.round(used.heapTotal / 1024 / 1024),
      },
      uptime: Math.round(process.uptime()),
    };
  }

  /**
   * @param {Object<string, {status: string}>} checks - Individual results.
   * @returns {string} 'healthy' when every check is healthy, otherwise
   *   'unhealthy'.
   */
  calculateOverallStatus(checks) {
    const statuses = Object.values(checks).map((check) => check.status);
    return statuses.every((status) => status === 'healthy')
      ? 'healthy'
      : 'unhealthy';
  }
}

module.exports = HealthMonitor;
This comprehensive monitoring and alerting setup ensures you have full visibility into your Donobu automation flows, with real-time notifications and detailed metrics to maintain high reliability and performance.