agmission/Development/server/docs/archived/MONITORING_GUIDE.md

24 KiB

Simplified Partner System Monitoring Guide

Overview

This guide provides a lightweight monitoring approach for the partner integration system, focusing on essential health checks and logging without complex infrastructure requirements.

Basic Monitoring Strategy

1. Essential Health Checks

Simple Health Check Implementation

// File: services/monitoring/SimpleHealthChecker.js
class SimpleHealthChecker {
  constructor() {
    // Partner settings are exposed for consumers of this singleton; the
    // health checks themselves do not read them.
    this.partnerConfig = require('../helpers/partner_config');
  }

  /**
   * Run all basic health checks and aggregate them into a single report.
   * Never rejects: unexpected failures are folded into the result as
   * overall 'unhealthy' plus an error message.
   * @returns {Promise<{timestamp: Date, overall: string, components: Object, error?: string}>}
   */
  async performBasicHealthCheck() {
    const results = {
      timestamp: new Date(),
      overall: 'healthy',
      components: {}
    };

    try {
      // Check database connectivity
      results.components.database = await this.checkDatabase();

      // Check partner system users
      results.components.partnerUsers = await this.checkPartnerUsers();

      // Check application health (synchronous, process-level stats only)
      results.components.application = this.checkApplication();

      // Determine overall health from the individual component statuses
      results.overall = this.calculateOverallHealth(results.components);
    } catch (error) {
      results.overall = 'unhealthy';
      results.error = error.message;
    }

    return results;
  }

  /**
   * Ping MongoDB through the live mongoose connection.
   * @returns {Promise<{status: string, responseTime?: number, error?: string}>}
   */
  async checkDatabase() {
    try {
      const start = Date.now();
      await require('mongoose').connection.db.admin().ping();

      return {
        status: 'healthy',
        responseTime: Date.now() - start
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  /**
   * Count active partner system users and those stuck in an error state.
   * Healthy while fewer than half of the active users have sync errors.
   * @returns {Promise<{status: string, details?: Object, error?: string}>}
   */
  async checkPartnerUsers() {
    try {
      const { PartnerSystemUser } = require('../model/partner');
      const activeUsers = await PartnerSystemUser.countDocuments({ active: true });
      const errorUsers = await PartnerSystemUser.countDocuments({ syncStatus: 'error' });

      // FIX: with zero active users the original check (errorUsers < 0) was
      // always false, so an idle system was reported 'degraded'. Treat
      // "no active users and no errors" as healthy.
      const isHealthy = activeUsers === 0
        ? errorUsers === 0
        : errorUsers < activeUsers * 0.5;

      return {
        status: isHealthy ? 'healthy' : 'degraded',
        details: {
          activeUsers,
          errorUsers,
          // FIX: toFixed() returns a string; wrap in Number() so errorRate
          // has a numeric type whether or not there are active users.
          errorRate: activeUsers > 0
            ? Number((errorUsers / activeUsers * 100).toFixed(2))
            : 0
        }
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  /**
   * Report process-level health: heap usage against a 1 GB threshold,
   * plus uptime for the dashboard.
   * @returns {{status: string, details: {memoryUsedMB: number, uptimeHours: number}}}
   */
  checkApplication() {
    const memoryUsage = process.memoryUsage();
    const uptime = process.uptime();
    const memoryThreshold = 1024 * 1024 * 1024; // 1GB heap budget

    return {
      status: memoryUsage.heapUsed < memoryThreshold ? 'healthy' : 'degraded',
      details: {
        memoryUsedMB: Math.round(memoryUsage.heapUsed / 1024 / 1024),
        uptimeHours: Math.round(uptime / 3600 * 100) / 100
      }
    };
  }

  /**
   * Aggregate component statuses: any 'unhealthy' wins, then 'degraded',
   * otherwise 'healthy'.
   * @param {Object} components - Map of component name -> { status, ... }.
   * @returns {string}
   */
  calculateOverallHealth(components) {
    const statuses = Object.values(components).map(c => c.status);

    if (statuses.includes('unhealthy')) {
      return 'unhealthy';
    }
    if (statuses.includes('degraded')) {
      return 'degraded';
    }
    return 'healthy';
  }

  /**
   * Express handler for the health endpoint. 'degraded' still answers 200
   * so load balancers keep routing; only 'unhealthy' answers 503.
   * (calculateOverallHealth only ever produces those three values.)
   */
  healthCheckMiddleware() {
    return async (req, res) => {
      const health = await this.performBasicHealthCheck();
      const statusCode = health.overall === 'unhealthy' ? 503 : 200;

      res.status(statusCode).json(health);
    };
  }
}

module.exports = new SimpleHealthChecker();

2. Basic Logging Strategy

Simple Logger Implementation

// File: services/monitoring/SimpleLogger.js
const winston = require('winston');

// Lightweight winston wrapper for partner-related logging. Errors are
// duplicated to logs/partner-errors.log; everything lands in
// logs/partner-activity.log, with console mirroring outside production.
class SimpleLogger {
  constructor() {
    const fileTransports = [
      new winston.transports.File({
        filename: 'logs/partner-errors.log',
        level: 'error'
      }),
      new winston.transports.File({
        filename: 'logs/partner-activity.log'
      })
    ];

    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.json()
      ),
      transports: fileTransports
    });

    // Mirror output to the console for easier local debugging.
    if (process.env.NODE_ENV !== 'production') {
      this.logger.add(new winston.transports.Console({
        format: winston.format.simple()
      }));
    }
  }

  /** Record a partner operation, picking the level from its outcome. */
  logPartnerOperation(operation, partner, success, metadata = {}) {
    const entry = {
      operation,
      partner,
      success,
      timestamp: new Date(),
      ...metadata
    };

    const message = success
      ? 'Partner operation completed'
      : 'Partner operation failed';
    this.logger.log(success ? 'info' : 'error', message, entry);
  }

  /** Record a critical error with its stack trace and caller context. */
  logError(error, context = {}) {
    this.logger.error('Partner system error', {
      message: error.message,
      stack: error.stack,
      context,
      timestamp: new Date()
    });
  }

  /** Record one sync activity between a customer and a partner. */
  logSync(customerId, partnerId, operation, status, metadata = {}) {
    this.logger.info('Partner sync activity', {
      customerId,
      partnerId,
      operation,
      status,
      metadata,
      timestamp: new Date()
    });
  }
}

module.exports = new SimpleLogger();

Key Metrics to Monitor

1. Partner System User Health

  • Active partner system users count
  • Error rate per partner
  • Last successful sync per customer

2. API Call Success Rates

  • Successful vs failed API calls per partner
  • Response times (basic timing)
  • Authentication failures

3. Application Health

  • Memory usage
  • Uptime
  • Database connectivity

Simple Alerting

Email Alerts for Critical Issues

// File: services/monitoring/SimpleAlerting.js
const nodemailer = require('nodemailer');

class SimpleAlerting {
  constructor() {
    // FIX: the nodemailer factory is createTransport(); createTransporter()
    // does not exist and threw a TypeError at startup.
    this.transporter = nodemailer.createTransport({
      host: process.env.SMTP_HOST,
      // FIX: env vars are strings; nodemailer expects a numeric port.
      port: Number(process.env.SMTP_PORT),
      secure: false,
      auth: {
        user: process.env.SMTP_USER,
        pass: process.env.SMTP_PASS
      }
    });

    // FIX: ''.split(',') yields [''] (length 1), so the "no recipients"
    // guard in sendAlert() never fired; drop empty/whitespace-only entries.
    this.alertEmails = (process.env.ALERT_EMAILS || '')
      .split(',')
      .map(email => email.trim())
      .filter(Boolean);
  }

  /**
   * Send an alert email to all configured recipients.
   * No-op when ALERT_EMAILS is unset; never throws (failures are logged).
   * @param {string} subject - Short description of the issue.
   * @param {string} message - Plain-text details (also rendered inside <pre>).
   * @param {string} [severity='warning'] - Tag prefixed to the subject line.
   */
  async sendAlert(subject, message, severity = 'warning') {
    if (!this.alertEmails.length) return;

    const emailContent = {
      from: process.env.ALERT_FROM_EMAIL,
      to: this.alertEmails.join(','),
      subject: `[${severity.toUpperCase()}] ${subject}`,
      text: message,
      html: `<pre>${message}</pre>`
    };

    try {
      await this.transporter.sendMail(emailContent);
    } catch (error) {
      // Alerting must never crash the application; log and continue.
      console.error('Failed to send alert email:', error);
    }
  }

  // Alert when partner sync fails repeatedly (5+ consecutive failures).
  async alertPartnerSyncFailure(customerId, partnerId, errorCount) {
    if (errorCount >= 5) {
      await this.sendAlert(
        'Partner Sync Failure',
        `Customer ${customerId} has failed to sync with partner ${partnerId} ${errorCount} times consecutively.`,
        'critical'
      );
    }
  }

  // Alert when partner API is down
  async alertPartnerDown(partnerCode, error) {
    await this.sendAlert(
      'Partner API Down',
      `Partner ${partnerCode} API is not responding: ${error}`,
      'critical'
    );
  }
}

module.exports = new SimpleAlerting();

Dashboard Implementation

Simple HTML Dashboard

<!-- File: public/partner-dashboard.html -->
<!DOCTYPE html>
<html>
<head>
    <title>Partner Integration Dashboard</title>
    <style>
        /* Status colors mirror the health states reported by /api/health */
        .status-healthy { color: green; }
        .status-degraded { color: orange; }
        .status-unhealthy { color: red; }
        .card { border: 1px solid #ddd; margin: 10px; padding: 15px; }
    </style>
</head>
<body>
    <h1>Partner Integration Dashboard</h1>

    <div id="health-status" class="card">
        <h2>System Health</h2>
        <div id="health-content">Loading...</div>
    </div>

    <div id="partner-stats" class="card">
        <h2>Partner Statistics</h2>
        <div id="stats-content">Loading...</div>
    </div>

    <script>
        // Render one component row. FIX: when the health check crashes, the
        // payload may omit components; the original dereferenced
        // health.components.database.status and threw, hiding the real
        // server state behind a generic "Error loading" message.
        function componentLine(label, component) {
            const status = component?.status ?? 'unknown';
            return `<p>${label}: <span class="status-${status}">${status}</span></p>`;
        }

        async function loadHealthStatus() {
            try {
                // NOTE: the endpoint deliberately answers 503 when unhealthy;
                // a non-2xx response still carries the JSON report to show.
                const response = await fetch('/api/health');
                const health = await response.json();

                document.getElementById('health-content').innerHTML = `
                    <p>Overall Status: <span class="status-${health.overall}">${health.overall}</span></p>
                    ${componentLine('Database', health.components?.database)}
                    ${componentLine('Partner Users', health.components?.partnerUsers)}
                    ${componentLine('Application', health.components?.application)}
                `;
            } catch (error) {
                document.getElementById('health-content').innerHTML = 'Error loading health status';
            }
        }

        async function loadPartnerStats() {
            try {
                const response = await fetch('/api/partners/stats');
                const stats = await response.json();

                document.getElementById('stats-content').innerHTML = `
                    <p>Total Partners: ${stats.totalPartners}</p>
                    <p>Active Partner Users: ${stats.activePartnerUsers}</p>
                    <p>Recent Sync Errors: ${stats.recentErrors}</p>
                `;
            } catch (error) {
                document.getElementById('stats-content').innerHTML = 'Error loading partner stats';
            }
        }

        // Auto-refresh every 30 seconds
        setInterval(() => {
            loadHealthStatus();
            loadPartnerStats();
        }, 30000);

        // Initial load
        loadHealthStatus();
        loadPartnerStats();
    </script>
</body>
</html>

This simplified monitoring approach provides essential visibility without requiring complex infrastructure like Grafana, Prometheus, or extensive logging systems.

async performHealthCheck() {
  const startTime = Date.now();
  const results = { timestamp: new Date(), overall: 'healthy', components: {}, duration: 0 };

try {
  // Check database connectivity
  results.components.database = await this.checkDatabase();
  
  // Check queue system
  results.components.queues = await this.checkQueues();
  
  // Check partner connectivity
  results.components.partners = await this.checkPartners();
  
  // Check application health
  results.components.application = await this.checkApplication();

  // Determine overall health
  results.overall = this.calculateOverallHealth(results.components);
  
} catch (error) {
  results.overall = 'unhealthy';
  results.error = error.message;
}

results.duration = Date.now() - startTime;
this.healthStatus = results;

return results;

}

async checkDatabase() {
  try {
    const start = Date.now();
    await this.database.db.admin().ping();

  return {
    status: 'healthy',
    responseTime: Date.now() - start,
    details: {
      connected: true,
      readyState: this.database.connection.readyState
    }
  };
} catch (error) {
  return {
    status: 'unhealthy',
    error: error.message
  };
}

}

async checkQueues() {
  try {
    const stats = await this.queueSystem.getQueueStats();
    const totalPending = Object.values(stats).reduce((sum, queue) => sum + queue.waiting + queue.active, 0);

  const isHealthy = totalPending < 1000; // Threshold for healthy queue

  return {
    status: isHealthy ? 'healthy' : 'degraded',
    details: {
      totalPending,
      stats
    }
  };
} catch (error) {
  return {
    status: 'unhealthy',
    error: error.message
  };
}

}

async checkPartners() {
  const partnerResults = {};
  const partners = this.partnerRegistry.getAll();

for (const partner of partners) {
  try {
    const result = await partner.healthCheck();
    partnerResults[partner.getPartnerType()] = result;
  } catch (error) {
    partnerResults[partner.getPartnerType()] = {
      status: 'unhealthy',
      error: error.message
    };
  }
}

const healthyPartners = Object.values(partnerResults)
  .filter(p => p.status === 'healthy').length;
const totalPartners = Object.keys(partnerResults).length;

return {
  status: healthyPartners > 0 ? 'healthy' : 'unhealthy',
  healthyCount: healthyPartners,
  totalCount: totalPartners,
  details: partnerResults
};

}

async checkApplication() {
  try {
    const memoryUsage = process.memoryUsage();
    const cpuUsage = process.cpuUsage();
    const uptime = process.uptime();

  const memoryThreshold = 1024 * 1024 * 1024; // 1GB
  const isMemoryHealthy = memoryUsage.heapUsed < memoryThreshold;

  return {
    status: isMemoryHealthy ? 'healthy' : 'degraded',
    details: {
      memory: memoryUsage,
      cpu: cpuUsage,
      uptime,
      nodeVersion: process.version
    }
  };
} catch (error) {
  return {
    status: 'unhealthy',
    error: error.message
  };
}

}

calculateOverallHealth(components) {
  const statuses = Object.values(components).map(c => c.status);

if (statuses.includes('unhealthy')) {
  return 'unhealthy';
} else if (statuses.includes('degraded')) {
  return 'degraded';
} else {
  return 'healthy';
}

}

getHealthStatus() {
  return this.healthStatus;
}

// Express middleware for health check endpoint
healthCheckMiddleware() {
  return async (req, res) => {
    const health = await this.performHealthCheck();
    const statusCode = health.overall === 'healthy' ? 200 :
                       health.overall === 'degraded' ? 200 : 503;

  res.status(statusCode).json(health);
};

}
}

module.exports = HealthChecker;


### 3. Structured Logging

```javascript
// File: services/monitoring/Logger.js
const winston = require('winston');

// Structured JSON logger for the partner-integration service, built on
// winston. Errors are duplicated to logs/error.log, everything goes to
// logs/combined.log, and a console transport is attached outside production.
class Logger {
  constructor() {
    const transports = [
      new winston.transports.File({ filename: 'logs/error.log', level: 'error' }),
      new winston.transports.File({ filename: 'logs/combined.log' })
    ];

    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.errors({ stack: true }),
        winston.format.json()
      ),
      defaultMeta: {
        service: 'agmission-partner-integration',
        version: process.env.APP_VERSION || '1.0.0'
      },
      transports
    });

    // Console output is only useful during local development.
    if (process.env.NODE_ENV !== 'production') {
      this.logger.add(new winston.transports.Console({
        format: winston.format.simple()
      }));
    }
  }

  /** Record a partner operation with its duration, choosing level by outcome. */
  logPartnerOperation(operation, partner, data, duration, success = true) {
    const entry = {
      operation,
      partner,
      duration,
      success,
      timestamp: new Date(),
      ...data
    };

    const message = success
      ? 'Partner operation completed'
      : 'Partner operation failed';
    this.logger.log(success ? 'info' : 'error', message, entry);
  }

  /** Record a job-assignment event. */
  logAssignment(jobId, userId, partnerType, status, metadata = {}) {
    this.logger.info('Job assignment', {
      jobId,
      userId,
      partnerType,
      status,
      metadata,
      timestamp: new Date()
    });
  }

  /** Record a sync operation; non-'success' statuses log at error level. */
  logSync(assignmentId, operation, status, duration, metadata = {}) {
    const entry = {
      assignmentId,
      operation,
      status,
      duration,
      metadata,
      timestamp: new Date()
    };

    const succeeded = status === 'success';
    this.logger.log(
      succeeded ? 'info' : 'error',
      succeeded ? 'Sync operation completed' : 'Sync operation failed',
      entry
    );
  }

  /** Record the status of one data-processing stage. */
  logDataProcessing(applicationId, stage, status, metadata = {}) {
    this.logger.info('Data processing stage', {
      applicationId,
      stage,
      status,
      metadata,
      timestamp: new Date()
    });
  }

  /** Record an error with its stack trace and caller-supplied context. */
  logError(error, context = {}) {
    this.logger.error('Application error', {
      message: error.message,
      stack: error.stack,
      context,
      timestamp: new Date()
    });
  }

  /** Record a timing/performance measurement. */
  logPerformance(operation, duration, metadata = {}) {
    this.logger.info('Performance metric', {
      operation,
      duration,
      metadata,
      timestamp: new Date()
    });
  }
}

module.exports = new Logger();
```
Dashboard Configuration

1. Grafana Dashboard JSON

{
  "dashboard": {
    "id": null,
    "title": "Partner Integration Monitoring",
    "tags": ["agmission", "partners"],
    "timezone": "UTC",
    "panels": [
      {
        "id": 1,
        "title": "Partner API Response Times",
        "type": "stat",
        "targets": [
          {
            "expr": "avg(partner_api_duration_seconds) by (partner)",
            "legendFormat": "{{partner}} avg"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 2},
                {"color": "red", "value": 5}
              ]
            }
          }
        }
      },
      {
        "id": 2,
        "title": "Queue Depth",
        "type": "graph",
        "targets": [
          {
            "expr": "queue_depth",
            "legendFormat": "{{queue_type}} - {{partner}}"
          }
        ]
      },
      {
        "id": 3,
        "title": "Assignment Success Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "rate(job_assignments_total{status=\"success\"}[5m]) / rate(job_assignments_total[5m]) * 100",
            "legendFormat": "Success Rate %"
          }
        ]
      },
      {
        "id": 4,
        "title": "Error Rates by Partner",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(partner_api_requests_total{status!=\"success\"}[5m])",
            "legendFormat": "{{partner}} errors/sec"
          }
        ]
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "30s"
  }
}

2. Alert Rules Configuration

# File: monitoring/alerts.yml
groups:
  - name: partner_integration
    rules:
      - alert: PartnerAPIHighErrorRate
        expr: rate(partner_api_requests_total{status!="success"}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "High error rate for partner {{ $labels.partner }}"
          description: "Partner {{ $labels.partner }} has error rate of {{ $value }} errors/sec"

      - alert: PartnerAPIDown
        expr: up{job="partner_api"} == 0
        for: 1m
        labels:
          severity: critical
          team: integration
        annotations:
          summary: "Partner API {{ $labels.partner }} is down"
          description: "Partner {{ $labels.partner }} API has been unreachable for over 1 minute"

      - alert: QueueDepthHigh
        expr: queue_depth > 1000
        for: 5m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "High queue depth for {{ $labels.queue_type }}"
          description: "Queue {{ $labels.queue_type }} has {{ $value }} pending jobs"

      - alert: SyncFailureRateHigh
        expr: rate(sync_duration_seconds{status="failed"}[10m]) > 0.05
        for: 5m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "High sync failure rate for {{ $labels.partner }}"
          description: "Partner {{ $labels.partner }} sync failure rate is {{ $value }}"

      - alert: DataProcessingStuck
        expr: increase(data_processing_duration_seconds_count[1h]) == 0
        for: 30m
        labels:
          severity: critical
          team: integration
        annotations:
          summary: "Data processing appears stuck"
          description: "No data processing completions in the last hour"

Troubleshooting Playbook

1. Partner API Issues

Symptoms

  • High error rates in partner API calls
  • Timeouts or connection failures
  • Authentication errors

Investigation Steps

  1. Check partner API status dashboard
  2. Verify network connectivity to partner
  3. Validate API credentials and tokens
  4. Review partner API rate limits
  5. Check partner service logs

Resolution Actions

# Check partner connectivity
curl -H "Authorization: Bearer $TOKEN" https://api.partner.com/health

# Review recent errors
kubectl logs -f deployment/agmission-api | grep "partner.*error"

# Restart partner service if needed
kubectl rollout restart deployment/agmission-api

2. Queue Processing Issues

Symptoms

  • Increasing queue depth
  • Jobs stuck in pending status
  • Worker processes crashing

Investigation Steps

  1. Check queue worker health and logs
  2. Verify Redis connectivity
  3. Monitor memory and CPU usage
  4. Check for deadlocks in job processing

Resolution Actions

# Check queue status
redis-cli -h redis-host info keyspace

# Restart queue workers
pm2 restart partner-sync-worker

# Clear stuck jobs (DANGER: flushdb deletes EVERY key in the selected Redis
# database, not just stuck jobs — prefer deleting the specific job keys)
redis-cli -h redis-host flushdb

3. Data Sync Issues

Symptoms

  • Jobs not syncing to partners
  • Data not being retrieved from partners
  • Sync operations timing out

Investigation Steps

  1. Check assignment sync states in database
  2. Verify partner job creation
  3. Review sync queue processing
  4. Check for configuration issues

Resolution Actions

// Manual sync trigger via API
POST /api/sync/assignments/{assignmentId}/sync
{
  "operation": "job_upload",
  "force": true
}

// Database query to check sync status
db.job_assigns.find({
  "syncState.jobUpload.status": "failed",
  "partnerType": "satloc"
}).limit(10);

4. Performance Issues

Symptoms

  • Slow response times
  • High memory usage
  • Database query timeouts

Investigation Steps

  1. Monitor application performance metrics
  2. Check database query performance
  3. Review memory and CPU utilization
  4. Analyze slow API endpoints

Resolution Actions

# Scale up application pods
kubectl scale deployment agmission-api --replicas=3

# Check database performance
db.runCommand({currentOp: 1, "secs_running": {$gte: 5}})

# Monitor memory usage
kubectl top pods -l app=agmission-api

Incident Response Procedures

1. Incident Classification

| Severity | Response Time | Example Issues |
| --- | --- | --- |
| P0 - Critical | 15 minutes | Complete system outage, data loss |
| P1 - High | 1 hour | Partner integration down, major feature broken |
| P2 - Medium | 4 hours | Performance degradation, minor feature issues |
| P3 - Low | 24 hours | Cosmetic issues, non-critical bugs |

2. Escalation Matrix

Level 1: On-call Engineer
├── Initial response and investigation
├── Follow standard playbooks
└── Escalate to Level 2 if needed

Level 2: Senior Engineer + Team Lead
├── Complex troubleshooting
├── Architecture decisions
└── Escalate to Level 3 if needed

Level 3: Architect + Management
├── System design issues
├── Business impact decisions
└── External vendor coordination

3. Communication Templates

Initial Alert

🚨 INCIDENT: Partner Integration Issue
Severity: P1
Impact: Satloc jobs not syncing
Started: 2025-07-18 10:30 UTC
Assigned: @engineer-oncall
Status: Investigating
Next Update: 10:45 UTC

Status Update

📊 UPDATE: Partner Integration Issue
Severity: P1
Root Cause: Partner API rate limiting
Action: Implementing exponential backoff
ETA: 11:00 UTC for resolution
Next Update: 11:00 UTC

Resolution Notice

✅ RESOLVED: Partner Integration Issue
Duration: 30 minutes
Root Cause: Partner API rate limiting
Fix: Updated retry logic with exponential backoff
Prevention: Added rate limit monitoring
Post-mortem: Scheduled for 2025-07-19 14:00 UTC

This monitoring and observability strategy provides comprehensive visibility into the partner integration system, enabling proactive issue detection and rapid incident response.