agmission/Development/server/docs/archived/MONITORING_GUIDE.md

24 KiB

Simplified Partner System Monitoring Guide

Overview

This guide provides a lightweight monitoring approach for the partner integration system, focusing on essential health checks and logging without complex infrastructure requirements.

Basic Monitoring Strategy

1. Essential Health Checks

Simple Health Check Implementation

// File: services/monitoring/SimpleHealthChecker.js
class SimpleHealthChecker {
  constructor() {
    // Partner settings are exposed for consumers of this singleton; the
    // health checks themselves do not read them.
    this.partnerConfig = require('../helpers/partner_config');
  }

  /**
   * Run all basic health checks and aggregate them into a single report.
   * Never rejects: unexpected failures are folded into the result as
   * overall 'unhealthy' plus an error message.
   * @returns {Promise<{timestamp: Date, overall: string, components: Object, error?: string}>}
   */
  async performBasicHealthCheck() {
    const results = {
      timestamp: new Date(),
      overall: 'healthy',
      components: {}
    };

    try {
      // Check database connectivity
      results.components.database = await this.checkDatabase();

      // Check partner system users
      results.components.partnerUsers = await this.checkPartnerUsers();

      // Check application health (synchronous, process-level stats only)
      results.components.application = this.checkApplication();

      // Determine overall health from the individual component statuses
      results.overall = this.calculateOverallHealth(results.components);
    } catch (error) {
      results.overall = 'unhealthy';
      results.error = error.message;
    }

    return results;
  }

  /**
   * Ping MongoDB through the live mongoose connection.
   * @returns {Promise<{status: string, responseTime?: number, error?: string}>}
   */
  async checkDatabase() {
    try {
      const start = Date.now();
      await require('mongoose').connection.db.admin().ping();

      return {
        status: 'healthy',
        responseTime: Date.now() - start
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  /**
   * Count active partner system users and those stuck in an error state.
   * Healthy while fewer than half of the active users have sync errors.
   * @returns {Promise<{status: string, details?: Object, error?: string}>}
   */
  async checkPartnerUsers() {
    try {
      const { PartnerSystemUser } = require('../model/partner');
      const activeUsers = await PartnerSystemUser.countDocuments({ active: true });
      const errorUsers = await PartnerSystemUser.countDocuments({ syncStatus: 'error' });

      // FIX: with zero active users the original check (errorUsers < 0) was
      // always false, so an idle system was reported 'degraded'. Treat
      // "no active users and no errors" as healthy.
      const isHealthy = activeUsers === 0
        ? errorUsers === 0
        : errorUsers < activeUsers * 0.5;

      return {
        status: isHealthy ? 'healthy' : 'degraded',
        details: {
          activeUsers,
          errorUsers,
          // FIX: toFixed() returns a string; wrap in Number() so errorRate
          // has a numeric type whether or not there are active users.
          errorRate: activeUsers > 0
            ? Number((errorUsers / activeUsers * 100).toFixed(2))
            : 0
        }
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  /**
   * Report process-level health: heap usage against a 1 GB threshold,
   * plus uptime for the dashboard.
   * @returns {{status: string, details: {memoryUsedMB: number, uptimeHours: number}}}
   */
  checkApplication() {
    const memoryUsage = process.memoryUsage();
    const uptime = process.uptime();
    const memoryThreshold = 1024 * 1024 * 1024; // 1GB heap budget

    return {
      status: memoryUsage.heapUsed < memoryThreshold ? 'healthy' : 'degraded',
      details: {
        memoryUsedMB: Math.round(memoryUsage.heapUsed / 1024 / 1024),
        uptimeHours: Math.round(uptime / 3600 * 100) / 100
      }
    };
  }

  /**
   * Aggregate component statuses: any 'unhealthy' wins, then 'degraded',
   * otherwise 'healthy'.
   * @param {Object} components - Map of component name -> { status, ... }.
   * @returns {string}
   */
  calculateOverallHealth(components) {
    const statuses = Object.values(components).map(c => c.status);

    if (statuses.includes('unhealthy')) {
      return 'unhealthy';
    }
    if (statuses.includes('degraded')) {
      return 'degraded';
    }
    return 'healthy';
  }

  /**
   * Express handler for the health endpoint. 'degraded' still answers 200
   * so load balancers keep routing; only 'unhealthy' answers 503.
   * (calculateOverallHealth only ever produces those three values.)
   */
  healthCheckMiddleware() {
    return async (req, res) => {
      const health = await this.performBasicHealthCheck();
      const statusCode = health.overall === 'unhealthy' ? 503 : 200;

      res.status(statusCode).json(health);
    };
  }
}

module.exports = new SimpleHealthChecker();

2. Basic Logging Strategy

Simple Logger Implementation

// File: services/monitoring/SimpleLogger.js
const winston = require('winston');

// Lightweight winston wrapper for partner-related logging. Errors are
// duplicated to logs/partner-errors.log; everything lands in
// logs/partner-activity.log, with console mirroring outside production.
class SimpleLogger {
  constructor() {
    const fileTransports = [
      new winston.transports.File({
        filename: 'logs/partner-errors.log',
        level: 'error'
      }),
      new winston.transports.File({
        filename: 'logs/partner-activity.log'
      })
    ];

    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.json()
      ),
      transports: fileTransports
    });

    // Mirror output to the console for easier local debugging.
    if (process.env.NODE_ENV !== 'production') {
      this.logger.add(new winston.transports.Console({
        format: winston.format.simple()
      }));
    }
  }

  /** Record a partner operation, picking the level from its outcome. */
  logPartnerOperation(operation, partner, success, metadata = {}) {
    const entry = {
      operation,
      partner,
      success,
      timestamp: new Date(),
      ...metadata
    };

    const message = success
      ? 'Partner operation completed'
      : 'Partner operation failed';
    this.logger.log(success ? 'info' : 'error', message, entry);
  }

  /** Record a critical error with its stack trace and caller context. */
  logError(error, context = {}) {
    this.logger.error('Partner system error', {
      message: error.message,
      stack: error.stack,
      context,
      timestamp: new Date()
    });
  }

  /** Record one sync activity between a customer and a partner. */
  logSync(customerId, partnerId, operation, status, metadata = {}) {
    this.logger.info('Partner sync activity', {
      customerId,
      partnerId,
      operation,
      status,
      metadata,
      timestamp: new Date()
    });
  }
}

module.exports = new SimpleLogger();

Key Metrics to Monitor

1. Partner System User Health

  • Active partner system users count
  • Error rate per partner
  • Last successful sync per customer

2. API Call Success Rates

  • Successful vs failed API calls per partner
  • Response times (basic timing)
  • Authentication failures

3. Application Health

  • Memory usage
  • Uptime
  • Database connectivity

Simple Alerting

Email Alerts for Critical Issues

// File: services/monitoring/SimpleAlerting.js
const nodemailer = require('nodemailer');

class SimpleAlerting {
  constructor() {
    // FIX: the nodemailer factory is createTransport(); createTransporter()
    // does not exist and threw a TypeError at startup.
    this.transporter = nodemailer.createTransport({
      host: process.env.SMTP_HOST,
      // FIX: env vars are strings; nodemailer expects a numeric port.
      port: Number(process.env.SMTP_PORT),
      secure: false,
      auth: {
        user: process.env.SMTP_USER,
        pass: process.env.SMTP_PASS
      }
    });

    // FIX: ''.split(',') yields [''] (length 1), so the "no recipients"
    // guard in sendAlert() never fired; drop empty/whitespace-only entries.
    this.alertEmails = (process.env.ALERT_EMAILS || '')
      .split(',')
      .map(email => email.trim())
      .filter(Boolean);
  }

  /**
   * Send an alert email to all configured recipients.
   * No-op when ALERT_EMAILS is unset; never throws (failures are logged).
   * @param {string} subject - Short description of the issue.
   * @param {string} message - Plain-text details (also rendered inside <pre>).
   * @param {string} [severity='warning'] - Tag prefixed to the subject line.
   */
  async sendAlert(subject, message, severity = 'warning') {
    if (!this.alertEmails.length) return;

    const emailContent = {
      from: process.env.ALERT_FROM_EMAIL,
      to: this.alertEmails.join(','),
      subject: `[${severity.toUpperCase()}] ${subject}`,
      text: message,
      html: `<pre>${message}</pre>`
    };

    try {
      await this.transporter.sendMail(emailContent);
    } catch (error) {
      // Alerting must never crash the application; log and continue.
      console.error('Failed to send alert email:', error);
    }
  }

  // Alert when partner sync fails repeatedly (5+ consecutive failures).
  async alertPartnerSyncFailure(customerId, partnerId, errorCount) {
    if (errorCount >= 5) {
      await this.sendAlert(
        'Partner Sync Failure',
        `Customer ${customerId} has failed to sync with partner ${partnerId} ${errorCount} times consecutively.`,
        'critical'
      );
    }
  }

  // Alert when partner API is down
  async alertPartnerDown(partnerCode, error) {
    await this.sendAlert(
      'Partner API Down',
      `Partner ${partnerCode} API is not responding: ${error}`,
      'critical'
    );
  }
}

module.exports = new SimpleAlerting();

Dashboard Implementation

Simple HTML Dashboard

<!-- File: public/partner-dashboard.html -->
<!DOCTYPE html>
<html>
<head>
    <title>Partner Integration Dashboard</title>
    <style>
        /* Status colors mirror the health states reported by /api/health */
        .status-healthy { color: green; }
        .status-degraded { color: orange; }
        .status-unhealthy { color: red; }
        .card { border: 1px solid #ddd; margin: 10px; padding: 15px; }
    </style>
</head>
<body>
    <h1>Partner Integration Dashboard</h1>

    <div id="health-status" class="card">
        <h2>System Health</h2>
        <div id="health-content">Loading...</div>
    </div>

    <div id="partner-stats" class="card">
        <h2>Partner Statistics</h2>
        <div id="stats-content">Loading...</div>
    </div>

    <script>
        // Render one component row. FIX: when the health check crashes, the
        // payload may omit components; the original dereferenced
        // health.components.database.status and threw, hiding the real
        // server state behind a generic "Error loading" message.
        function componentLine(label, component) {
            const status = component?.status ?? 'unknown';
            return `<p>${label}: <span class="status-${status}">${status}</span></p>`;
        }

        async function loadHealthStatus() {
            try {
                // NOTE: the endpoint deliberately answers 503 when unhealthy;
                // a non-2xx response still carries the JSON report to show.
                const response = await fetch('/api/health');
                const health = await response.json();

                document.getElementById('health-content').innerHTML = `
                    <p>Overall Status: <span class="status-${health.overall}">${health.overall}</span></p>
                    ${componentLine('Database', health.components?.database)}
                    ${componentLine('Partner Users', health.components?.partnerUsers)}
                    ${componentLine('Application', health.components?.application)}
                `;
            } catch (error) {
                document.getElementById('health-content').innerHTML = 'Error loading health status';
            }
        }

        async function loadPartnerStats() {
            try {
                const response = await fetch('/api/partners/stats');
                const stats = await response.json();

                document.getElementById('stats-content').innerHTML = `
                    <p>Total Partners: ${stats.totalPartners}</p>
                    <p>Active Partner Users: ${stats.activePartnerUsers}</p>
                    <p>Recent Sync Errors: ${stats.recentErrors}</p>
                `;
            } catch (error) {
                document.getElementById('stats-content').innerHTML = 'Error loading partner stats';
            }
        }

        // Auto-refresh every 30 seconds
        setInterval(() => {
            loadHealthStatus();
            loadPartnerStats();
        }, 30000);

        // Initial load
        loadHealthStatus();
        loadPartnerStats();
    </script>
</body>
</html>

This simplified monitoring approach provides essential visibility without requiring complex infrastructure like Grafana, Prometheus, or extensive logging systems.

async performHealthCheck() {
  const startTime = Date.now();
  const results = { timestamp: new Date(), overall: 'healthy', components: {}, duration: 0 };

try {
  // Check database connectivity
  results.components.database = await this.checkDatabase();
  
  // Check queue system
  results.components.queues = await this.checkQueues();
  
  // Check partner connectivity
  results.components.partners = await this.checkPartners();
  
  // Check application health
  results.components.application = await this.checkApplication();

  // Determine overall health
  results.overall = this.calculateOverallHealth(results.components);
  
} catch (error) {
  results.overall = 'unhealthy';
  results.error = error.message;
}

results.duration = Date.now() - startTime;
this.healthStatus = results;

return results;

}

async checkDatabase() {
  try {
    const start = Date.now();
    await this.database.db.admin().ping();

  return {
    status: 'healthy',
    responseTime: Date.now() - start,
    details: {
      connected: true,
      readyState: this.database.connection.readyState
    }
  };
} catch (error) {
  return {
    status: 'unhealthy',
    error: error.message
  };
}

}

async checkQueues() {
  try {
    const stats = await this.queueSystem.getQueueStats();
    const totalPending = Object.values(stats).reduce((sum, queue) => sum + queue.waiting + queue.active, 0);

  const isHealthy = totalPending < 1000; // Threshold for healthy queue

  return {
    status: isHealthy ? 'healthy' : 'degraded',
    details: {
      totalPending,
      stats
    }
  };
} catch (error) {
  return {
    status: 'unhealthy',
    error: error.message
  };
}

}

async checkPartners() {
  const partnerResults = {};
  const partners = this.partnerRegistry.getAll();

for (const partner of partners) {
  try {
    const result = await partner.healthCheck();
    partnerResults[partner.getPartnerType()] = result;
  } catch (error) {
    partnerResults[partner.getPartnerType()] = {
      status: 'unhealthy',
      error: error.message
    };
  }
}

const healthyPartners = Object.values(partnerResults)
  .filter(p => p.status === 'healthy').length;
const totalPartners = Object.keys(partnerResults).length;

return {
  status: healthyPartners > 0 ? 'healthy' : 'unhealthy',
  healthyCount: healthyPartners,
  totalCount: totalPartners,
  details: partnerResults
};

}

async checkApplication() {
  try {
    const memoryUsage = process.memoryUsage();
    const cpuUsage = process.cpuUsage();
    const uptime = process.uptime();

  const memoryThreshold = 1024 * 1024 * 1024; // 1GB
  const isMemoryHealthy = memoryUsage.heapUsed < memoryThreshold;

  return {
    status: isMemoryHealthy ? 'healthy' : 'degraded',
    details: {
      memory: memoryUsage,
      cpu: cpuUsage,
      uptime,
      nodeVersion: process.version
    }
  };
} catch (error) {
  return {
    status: 'unhealthy',
    error: error.message
  };
}

}

calculateOverallHealth(components) {
  const statuses = Object.values(components).map(c => c.status);

if (statuses.includes('unhealthy')) {
  return 'unhealthy';
} else if (statuses.includes('degraded')) {
  return 'degraded';
} else {
  return 'healthy';
}

}

getHealthStatus() {
  return this.healthStatus;
}

// Express middleware for health check endpoint
healthCheckMiddleware() {
  return async (req, res) => {
    const health = await this.performHealthCheck();
    const statusCode = health.overall === 'healthy' ? 200 :
                       health.overall === 'degraded' ? 200 : 503;

  res.status(statusCode).json(health);
};

}
}

module.exports = HealthChecker;


### 3. Structured Logging

```javascript
// File: services/monitoring/Logger.js
const winston = require('winston');

// Structured JSON logger for the partner-integration service, built on
// winston. Errors are duplicated to logs/error.log, everything goes to
// logs/combined.log, and a console transport is attached outside production.
class Logger {
  constructor() {
    const transports = [
      new winston.transports.File({ filename: 'logs/error.log', level: 'error' }),
      new winston.transports.File({ filename: 'logs/combined.log' })
    ];

    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.errors({ stack: true }),
        winston.format.json()
      ),
      defaultMeta: {
        service: 'agmission-partner-integration',
        version: process.env.APP_VERSION || '1.0.0'
      },
      transports
    });

    // Console output is only useful during local development.
    if (process.env.NODE_ENV !== 'production') {
      this.logger.add(new winston.transports.Console({
        format: winston.format.simple()
      }));
    }
  }

  /** Record a partner operation with its duration, choosing level by outcome. */
  logPartnerOperation(operation, partner, data, duration, success = true) {
    const entry = {
      operation,
      partner,
      duration,
      success,
      timestamp: new Date(),
      ...data
    };

    const message = success
      ? 'Partner operation completed'
      : 'Partner operation failed';
    this.logger.log(success ? 'info' : 'error', message, entry);
  }

  /** Record a job-assignment event. */
  logAssignment(jobId, userId, partnerType, status, metadata = {}) {
    this.logger.info('Job assignment', {
      jobId,
      userId,
      partnerType,
      status,
      metadata,
      timestamp: new Date()
    });
  }

  /** Record a sync operation; non-'success' statuses log at error level. */
  logSync(assignmentId, operation, status, duration, metadata = {}) {
    const entry = {
      assignmentId,
      operation,
      status,
      duration,
      metadata,
      timestamp: new Date()
    };

    const succeeded = status === 'success';
    this.logger.log(
      succeeded ? 'info' : 'error',
      succeeded ? 'Sync operation completed' : 'Sync operation failed',
      entry
    );
  }

  /** Record the status of one data-processing stage. */
  logDataProcessing(applicationId, stage, status, metadata = {}) {
    this.logger.info('Data processing stage', {
      applicationId,
      stage,
      status,
      metadata,
      timestamp: new Date()
    });
  }

  /** Record an error with its stack trace and caller-supplied context. */
  logError(error, context = {}) {
    this.logger.error('Application error', {
      message: error.message,
      stack: error.stack,
      context,
      timestamp: new Date()
    });
  }

  /** Record a timing/performance measurement. */
  logPerformance(operation, duration, metadata = {}) {
    this.logger.info('Performance metric', {
      operation,
      duration,
      metadata,
      timestamp: new Date()
    });
  }
}

module.exports = new Logger();
```
Dashboard Configuration

1. Grafana Dashboard JSON

{
  "dashboard": {
    "id": null,
    "title": "Partner Integration Monitoring",
    "tags": ["agmission", "partners"],
    "timezone": "UTC",
    "panels": [
      {
        "id": 1,
        "title": "Partner API Response Times",
        "type": "stat",
        "targets": [
          {
            "expr": "avg(partner_api_duration_seconds) by (partner)",
            "legendFormat": "{{partner}} avg"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 2},
                {"color": "red", "value": 5}
              ]
            }
          }
        }
      },
      {
        "id": 2,
        "title": "Queue Depth",
        "type": "graph",
        "targets": [
          {
            "expr": "queue_depth",
            "legendFormat": "{{queue_type}} - {{partner}}"
          }
        ]
      },
      {
        "id": 3,
        "title": "Assignment Success Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "rate(job_assignments_total{status=\"success\"}[5m]) / rate(job_assignments_total[5m]) * 100",
            "legendFormat": "Success Rate %"
          }
        ]
      },
      {
        "id": 4,
        "title": "Error Rates by Partner",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(partner_api_requests_total{status!=\"success\"}[5m])",
            "legendFormat": "{{partner}} errors/sec"
          }
        ]
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "30s"
  }
}

2. Alert Rules Configuration

# File: monitoring/alerts.yml
groups:
  - name: partner_integration
    rules:
      - alert: PartnerAPIHighErrorRate
        expr: rate(partner_api_requests_total{status!="success"}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "High error rate for partner {{ $labels.partner }}"
          description: "Partner {{ $labels.partner }} has error rate of {{ $value }} errors/sec"

      - alert: PartnerAPIDown
        expr: up{job="partner_api"} == 0
        for: 1m
        labels:
          severity: critical
          team: integration
        annotations:
          summary: "Partner API {{ $labels.partner }} is down"
          description: "Partner {{ $labels.partner }} API has been unreachable for over 1 minute"

      - alert: QueueDepthHigh
        expr: queue_depth > 1000
        for: 5m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "High queue depth for {{ $labels.queue_type }}"
          description: "Queue {{ $labels.queue_type }} has {{ $value }} pending jobs"

      - alert: SyncFailureRateHigh
        expr: rate(sync_duration_seconds{status="failed"}[10m]) > 0.05
        for: 5m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "High sync failure rate for {{ $labels.partner }}"
          description: "Partner {{ $labels.partner }} sync failure rate is {{ $value }}"

      - alert: DataProcessingStuck
        expr: increase(data_processing_duration_seconds_count[1h]) == 0
        for: 30m
        labels:
          severity: critical
          team: integration
        annotations:
          summary: "Data processing appears stuck"
          description: "No data processing completions in the last hour"

Troubleshooting Playbook

1. Partner API Issues

Symptoms

  • High error rates in partner API calls
  • Timeouts or connection failures
  • Authentication errors

Investigation Steps

  1. Check partner API status dashboard
  2. Verify network connectivity to partner
  3. Validate API credentials and tokens
  4. Review partner API rate limits
  5. Check partner service logs

Resolution Actions

# Check partner connectivity
curl -H "Authorization: Bearer $TOKEN" https://api.partner.com/health

# Review recent errors
kubectl logs -f deployment/agmission-api | grep "partner.*error"

# Restart partner service if needed
kubectl rollout restart deployment/agmission-api

2. Queue Processing Issues

Symptoms

  • Increasing queue depth
  • Jobs stuck in pending status
  • Worker processes crashing

Investigation Steps

  1. Check queue worker health and logs
  2. Verify Redis connectivity
  3. Monitor memory and CPU usage
  4. Check for deadlocks in job processing

Resolution Actions

# Check queue status
redis-cli -h redis-host info keyspace

# Restart queue workers
pm2 restart partner-sync-worker

# Clear stuck jobs (DANGER: flushdb deletes EVERY key in the selected Redis
# database, not just stuck jobs — prefer deleting the specific job keys)
redis-cli -h redis-host flushdb

3. Data Sync Issues

Symptoms

  • Jobs not syncing to partners
  • Data not being retrieved from partners
  • Sync operations timing out

Investigation Steps

  1. Check assignment sync states in database
  2. Verify partner job creation
  3. Review sync queue processing
  4. Check for configuration issues

Resolution Actions

// Manual sync trigger via API
POST /api/sync/assignments/{assignmentId}/sync
{
  "operation": "job_upload",
  "force": true
}

// Database query to check sync status
db.job_assigns.find({
  "syncState.jobUpload.status": "failed",
  "partnerType": "satloc"
}).limit(10);

4. Performance Issues

Symptoms

  • Slow response times
  • High memory usage
  • Database query timeouts

Investigation Steps

  1. Monitor application performance metrics
  2. Check database query performance
  3. Review memory and CPU utilization
  4. Analyze slow API endpoints

Resolution Actions

# Scale up application pods
kubectl scale deployment agmission-api --replicas=3

# Check database performance
db.runCommand({currentOp: 1, "secs_running": {$gte: 5}})

# Monitor memory usage
kubectl top pods -l app=agmission-api

Incident Response Procedures

1. Incident Classification

| Severity | Response Time | Example Issues |
| --- | --- | --- |
| P0 - Critical | 15 minutes | Complete system outage, data loss |
| P1 - High | 1 hour | Partner integration down, major feature broken |
| P2 - Medium | 4 hours | Performance degradation, minor feature issues |
| P3 - Low | 24 hours | Cosmetic issues, non-critical bugs |

2. Escalation Matrix

Level 1: On-call Engineer
├── Initial response and investigation
├── Follow standard playbooks
└── Escalate to Level 2 if needed

Level 2: Senior Engineer + Team Lead
├── Complex troubleshooting
├── Architecture decisions
└── Escalate to Level 3 if needed

Level 3: Architect + Management
├── System design issues
├── Business impact decisions
└── External vendor coordination

3. Communication Templates

Initial Alert

🚨 INCIDENT: Partner Integration Issue
Severity: P1
Impact: Satloc jobs not syncing
Started: 2025-07-18 10:30 UTC
Assigned: @engineer-oncall
Status: Investigating
Next Update: 10:45 UTC

Status Update

📊 UPDATE: Partner Integration Issue
Severity: P1
Root Cause: Partner API rate limiting
Action: Implementing exponential backoff
ETA: 11:00 UTC for resolution
Next Update: 11:00 UTC

Resolution Notice

✅ RESOLVED: Partner Integration Issue
Duration: 30 minutes
Root Cause: Partner API rate limiting
Fix: Updated retry logic with exponential backoff
Prevention: Added rate limit monitoring
Post-mortem: Scheduled for 2025-07-19 14:00 UTC

This monitoring and observability strategy provides comprehensive visibility into the partner integration system, enabling proactive issue detection and rapid incident response.