# Partner System Monitoring Guide
## Overview
This guide begins with a lightweight monitoring approach for the partner integration system, covering essential health checks, logging, and alerting without complex infrastructure. It then describes a more comprehensive observability setup (Prometheus metrics, Grafana dashboards, alert rules, and incident response procedures) for deployments that need it.
## Basic Monitoring Strategy
### 1. Essential Health Checks
#### Simple Health Check Implementation
```javascript
// File: services/monitoring/SimpleHealthChecker.js
class SimpleHealthChecker {
  constructor() {
    this.partnerConfig = require('../helpers/partner_config');
  }

  async performBasicHealthCheck() {
    const results = {
      timestamp: new Date(),
      overall: 'healthy',
      components: {}
    };

    try {
      // Check database connectivity
      results.components.database = await this.checkDatabase();

      // Check partner system users
      results.components.partnerUsers = await this.checkPartnerUsers();

      // Check application health
      results.components.application = this.checkApplication();

      // Determine overall health
      results.overall = this.calculateOverallHealth(results.components);
    } catch (error) {
      results.overall = 'unhealthy';
      results.error = error.message;
    }

    return results;
  }

  async checkDatabase() {
    try {
      const start = Date.now();
      await require('mongoose').connection.db.admin().ping();
      return {
        status: 'healthy',
        responseTime: Date.now() - start
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  async checkPartnerUsers() {
    try {
      const { PartnerSystemUser } = require('../model/partner');
      const activeUsers = await PartnerSystemUser.countDocuments({ active: true });
      const errorUsers = await PartnerSystemUser.countDocuments({ syncStatus: 'error' });
      return {
        status: errorUsers < activeUsers * 0.5 ? 'healthy' : 'degraded',
        details: {
          activeUsers,
          errorUsers,
          errorRate: activeUsers > 0 ? (errorUsers / activeUsers * 100).toFixed(2) : 0
        }
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  checkApplication() {
    const memoryUsage = process.memoryUsage();
    const uptime = process.uptime();
    const memoryThreshold = 1024 * 1024 * 1024; // 1GB

    return {
      status: memoryUsage.heapUsed < memoryThreshold ? 'healthy' : 'degraded',
      details: {
        memoryUsedMB: Math.round(memoryUsage.heapUsed / 1024 / 1024),
        uptimeHours: Math.round(uptime / 3600 * 100) / 100
      }
    };
  }

  calculateOverallHealth(components) {
    const statuses = Object.values(components).map(c => c.status);
    if (statuses.includes('unhealthy')) {
      return 'unhealthy';
    }
    if (statuses.includes('degraded')) {
      return 'degraded';
    }
    return 'healthy';
  }

  // Express middleware for health check endpoint
  healthCheckMiddleware() {
    return async (req, res) => {
      const health = await this.performBasicHealthCheck();
      const statusCode = health.overall === 'unhealthy' ? 503 : 200;
      res.status(statusCode).json(health);
    };
  }
}

module.exports = new SimpleHealthChecker();
```
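To expose the check over HTTP, mount the middleware on an Express route; the dashboard later in this guide polls `/api/health`. A minimal wiring sketch, assuming an existing Express `app`:
```javascript
// Sketch: wire the health check into an existing Express app
const healthChecker = require('./services/monitoring/SimpleHealthChecker');

// Responds 200 when healthy or degraded, 503 when unhealthy
app.get('/api/health', healthChecker.healthCheckMiddleware());
```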
### 2. Basic Logging Strategy
#### Simple Logger Implementation
```javascript
// File: services/monitoring/SimpleLogger.js
const winston = require('winston');
class SimpleLogger {
  constructor() {
    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.json()
      ),
      transports: [
        new winston.transports.File({
          filename: 'logs/partner-errors.log',
          level: 'error'
        }),
        new winston.transports.File({
          filename: 'logs/partner-activity.log'
        })
      ]
    });

    if (process.env.NODE_ENV !== 'production') {
      this.logger.add(new winston.transports.Console({
        format: winston.format.simple()
      }));
    }
  }

  // Log partner operations (success/failure)
  logPartnerOperation(operation, partner, success, metadata = {}) {
    const logData = {
      operation,
      partner,
      success,
      timestamp: new Date(),
      ...metadata
    };
    if (success) {
      this.logger.info('Partner operation completed', logData);
    } else {
      this.logger.error('Partner operation failed', logData);
    }
  }

  // Log critical errors
  logError(error, context = {}) {
    this.logger.error('Partner system error', {
      message: error.message,
      stack: error.stack,
      context,
      timestamp: new Date()
    });
  }

  // Log sync activities
  logSync(customerId, partnerId, operation, status, metadata = {}) {
    this.logger.info('Partner sync activity', {
      customerId,
      partnerId,
      operation,
      status,
      metadata,
      timestamp: new Date()
    });
  }
}

module.exports = new SimpleLogger();
```
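Typical call sites look like this (the identifiers and IDs are illustrative):
```javascript
const logger = require('./services/monitoring/SimpleLogger');

// Record a successful job upload to a partner
logger.logPartnerOperation('job_upload', 'satloc', true, { jobId: 'job-123' });

// Record a failed sync and capture the error with context
try {
  // ... sync work ...
} catch (err) {
  logger.logPartnerOperation('customer_sync', 'satloc', false, { customerId: 'cust-42' });
  logger.logError(err, { customerId: 'cust-42', partnerId: 'satloc' });
}
```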
## Key Metrics to Monitor
### 1. Partner System User Health
- Active partner system users count
- Error rate per partner
- Last successful sync per customer
### 2. API Call Success Rates
- Successful vs failed API calls per partner
- Response times (basic timing)
- Authentication failures
### 3. Application Health
- Memory usage
- Uptime
- Database connectivity
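The dashboard below reads the partner figures from an `/api/partners/stats` endpoint. A minimal sketch of such an endpoint, assuming the `PartnerSystemUser` model used above (the `Partner` model and the `updatedAt` filter are assumptions):
```javascript
// File: routes/partnerStats.js (illustrative sketch)
const express = require('express');
const router = express.Router();
// PartnerSystemUser matches the model used above; Partner is an assumed sibling model
const { Partner, PartnerSystemUser } = require('../model/partner');

router.get('/api/partners/stats', async (req, res) => {
  try {
    const oneDayAgo = new Date(Date.now() - 24 * 60 * 60 * 1000);
    const [totalPartners, activePartnerUsers, recentErrors] = await Promise.all([
      Partner.countDocuments({}),
      PartnerSystemUser.countDocuments({ active: true }),
      // "Recent" here means errored within the last day (updatedAt is an assumption)
      PartnerSystemUser.countDocuments({ syncStatus: 'error', updatedAt: { $gte: oneDayAgo } })
    ]);
    res.json({ totalPartners, activePartnerUsers, recentErrors });
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});

module.exports = router;
```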
## Simple Alerting
### Email Alerts for Critical Issues
```javascript
// File: services/monitoring/SimpleAlerting.js
const nodemailer = require('nodemailer');
class SimpleAlerting {
  constructor() {
    this.transporter = nodemailer.createTransport({
      host: process.env.SMTP_HOST,
      port: Number(process.env.SMTP_PORT) || 587, // default to the submission port if unset
      secure: false,
      auth: {
        user: process.env.SMTP_USER,
        pass: process.env.SMTP_PASS
      }
    });
    // Drop empty entries so an unset ALERT_EMAILS disables alerting
    this.alertEmails = (process.env.ALERT_EMAILS || '')
      .split(',')
      .map(email => email.trim())
      .filter(Boolean);
  }

  async sendAlert(subject, message, severity = 'warning') {
    if (!this.alertEmails.length) return;

    const emailContent = {
      from: process.env.ALERT_FROM_EMAIL,
      to: this.alertEmails.join(','),
      subject: `[${severity.toUpperCase()}] ${subject}`,
      text: message,
      html: `<pre>${message}</pre>`
    };

    try {
      await this.transporter.sendMail(emailContent);
    } catch (error) {
      console.error('Failed to send alert email:', error);
    }
  }

  // Alert when partner sync fails repeatedly
  async alertPartnerSyncFailure(customerId, partnerId, errorCount) {
    if (errorCount >= 5) {
      await this.sendAlert(
        'Partner Sync Failure',
        `Customer ${customerId} has failed to sync with partner ${partnerId} ${errorCount} times consecutively.`,
        'critical'
      );
    }
  }

  // Alert when partner API is down
  async alertPartnerDown(partnerCode, error) {
    await this.sendAlert(
      'Partner API Down',
      `Partner ${partnerCode} API is not responding: ${error}`,
      'critical'
    );
  }
}

module.exports = new SimpleAlerting();
```
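To tie the checker and the alerter together, a periodic loop is enough; a sketch assuming the two modules above:
```javascript
// File: services/monitoring/monitorLoop.js (illustrative wiring)
const healthChecker = require('./SimpleHealthChecker');
const alerting = require('./SimpleAlerting');

const CHECK_INTERVAL_MS = 60 * 1000; // run the basic health check every minute

setInterval(async () => {
  const health = await healthChecker.performBasicHealthCheck();
  if (health.overall === 'unhealthy') {
    // The full component breakdown goes into the email body
    await alerting.sendAlert(
      'Partner system unhealthy',
      JSON.stringify(health, null, 2),
      'critical'
    );
  }
}, CHECK_INTERVAL_MS);
```
In practice you would also de-duplicate alerts so a sustained outage does not send an email every minute.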
## Dashboard Implementation
### Simple HTML Dashboard
```html
<!-- File: public/partner-dashboard.html -->
<!DOCTYPE html>
<html>
<head>
  <title>Partner Integration Dashboard</title>
  <style>
    .status-healthy { color: green; }
    .status-degraded { color: orange; }
    .status-unhealthy { color: red; }
    .card { border: 1px solid #ddd; margin: 10px; padding: 15px; }
  </style>
</head>
<body>
  <h1>Partner Integration Dashboard</h1>

  <div id="health-status" class="card">
    <h2>System Health</h2>
    <div id="health-content">Loading...</div>
  </div>

  <div id="partner-stats" class="card">
    <h2>Partner Statistics</h2>
    <div id="stats-content">Loading...</div>
  </div>

  <script>
    async function loadHealthStatus() {
      try {
        const response = await fetch('/api/health');
        const health = await response.json();
        document.getElementById('health-content').innerHTML = `
          <p>Overall Status: <span class="status-${health.overall}">${health.overall}</span></p>
          <p>Database: <span class="status-${health.components.database.status}">${health.components.database.status}</span></p>
          <p>Partner Users: <span class="status-${health.components.partnerUsers.status}">${health.components.partnerUsers.status}</span></p>
          <p>Application: <span class="status-${health.components.application.status}">${health.components.application.status}</span></p>
        `;
      } catch (error) {
        document.getElementById('health-content').innerHTML = 'Error loading health status';
      }
    }

    async function loadPartnerStats() {
      try {
        const response = await fetch('/api/partners/stats');
        const stats = await response.json();
        document.getElementById('stats-content').innerHTML = `
          <p>Total Partners: ${stats.totalPartners}</p>
          <p>Active Partner Users: ${stats.activePartnerUsers}</p>
          <p>Recent Sync Errors: ${stats.recentErrors}</p>
        `;
      } catch (error) {
        document.getElementById('stats-content').innerHTML = 'Error loading partner stats';
      }
    }

    // Auto-refresh every 30 seconds
    setInterval(() => {
      loadHealthStatus();
      loadPartnerStats();
    }, 30000);

    // Initial load
    loadHealthStatus();
    loadPartnerStats();
  </script>
</body>
</html>
```
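The dashboard is a plain static file, so serving it requires nothing beyond Express's static middleware (assuming the `public/` directory above):
```javascript
// Makes public/partner-dashboard.html reachable at /partner-dashboard.html
app.use(express.static('public'));
```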
This simplified monitoring approach provides essential visibility without requiring complex infrastructure like Grafana, Prometheus, or extensive logging systems.
## Advanced Monitoring and Observability
The sections below describe a fuller setup for deployments that also run queues, multiple partner integrations, and a Prometheus/Grafana metrics stack.
### 1. Comprehensive Health Checks
```javascript
// File: services/monitoring/HealthChecker.js
class HealthChecker {
  // Minimal constructor sketch: inject the mongoose module plus whatever
  // queue system and partner registry implementations your deployment uses.
  constructor(database, queueSystem, partnerRegistry) {
    this.database = database; // the mongoose module
    this.queueSystem = queueSystem;
    this.partnerRegistry = partnerRegistry;
    this.healthStatus = null;
  }

  async performHealthCheck() {
    const startTime = Date.now();
    const results = {
      timestamp: new Date(),
      overall: 'healthy',
      components: {},
      duration: 0
    };

    try {
      // Check database connectivity
      results.components.database = await this.checkDatabase();

      // Check queue system
      results.components.queues = await this.checkQueues();

      // Check partner connectivity
      results.components.partners = await this.checkPartners();

      // Check application health
      results.components.application = await this.checkApplication();

      // Determine overall health
      results.overall = this.calculateOverallHealth(results.components);
    } catch (error) {
      results.overall = 'unhealthy';
      results.error = error.message;
    }

    results.duration = Date.now() - startTime;
    this.healthStatus = results;
    return results;
  }

  async checkDatabase() {
    try {
      const start = Date.now();
      await this.database.connection.db.admin().ping();
      return {
        status: 'healthy',
        responseTime: Date.now() - start,
        details: {
          connected: true,
          readyState: this.database.connection.readyState
        }
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  async checkQueues() {
    try {
      const stats = await this.queueSystem.getQueueStats();
      const totalPending = Object.values(stats).reduce((sum, queue) =>
        sum + queue.waiting + queue.active, 0);
      const isHealthy = totalPending < 1000; // Threshold for healthy queue
      return {
        status: isHealthy ? 'healthy' : 'degraded',
        details: {
          totalPending,
          stats
        }
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  async checkPartners() {
    const partnerResults = {};
    const partners = this.partnerRegistry.getAll();

    for (const partner of partners) {
      try {
        const result = await partner.healthCheck();
        partnerResults[partner.getPartnerType()] = result;
      } catch (error) {
        partnerResults[partner.getPartnerType()] = {
          status: 'unhealthy',
          error: error.message
        };
      }
    }

    const healthyPartners = Object.values(partnerResults)
      .filter(p => p.status === 'healthy').length;
    const totalPartners = Object.keys(partnerResults).length;

    return {
      status: healthyPartners > 0 ? 'healthy' : 'unhealthy',
      healthyCount: healthyPartners,
      totalCount: totalPartners,
      details: partnerResults
    };
  }

  async checkApplication() {
    try {
      const memoryUsage = process.memoryUsage();
      const cpuUsage = process.cpuUsage();
      const uptime = process.uptime();
      const memoryThreshold = 1024 * 1024 * 1024; // 1GB
      const isMemoryHealthy = memoryUsage.heapUsed < memoryThreshold;
      return {
        status: isMemoryHealthy ? 'healthy' : 'degraded',
        details: {
          memory: memoryUsage,
          cpu: cpuUsage,
          uptime,
          nodeVersion: process.version
        }
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  calculateOverallHealth(components) {
    const statuses = Object.values(components).map(c => c.status);
    if (statuses.includes('unhealthy')) {
      return 'unhealthy';
    }
    if (statuses.includes('degraded')) {
      return 'degraded';
    }
    return 'healthy';
  }

  getHealthStatus() {
    return this.healthStatus;
  }

  // Express middleware for health check endpoint
  healthCheckMiddleware() {
    return async (req, res) => {
      const health = await this.performHealthCheck();
      const statusCode = health.overall === 'unhealthy' ? 503 : 200;
      res.status(statusCode).json(health);
    };
  }
}

module.exports = HealthChecker;
```
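Unlike the simplified checker, this module exports the class rather than a singleton, so instantiate it during startup. A wiring sketch; the queue system and partner registry module paths are hypothetical:
```javascript
// Startup wiring (sketch; queue and registry modules are placeholders)
const mongoose = require('mongoose');
const HealthChecker = require('./services/monitoring/HealthChecker');
const queueSystem = require('./services/queue');        // hypothetical module
const partnerRegistry = require('./services/partners'); // hypothetical module

const healthChecker = new HealthChecker(mongoose, queueSystem, partnerRegistry);
app.get('/api/health', healthChecker.healthCheckMiddleware());
```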
### 2. Structured Logging
```javascript
// File: services/monitoring/Logger.js
const winston = require('winston');
class Logger {
  constructor() {
    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.errors({ stack: true }),
        winston.format.json()
      ),
      defaultMeta: {
        service: 'agmission-partner-integration',
        version: process.env.APP_VERSION || '1.0.0'
      },
      transports: [
        new winston.transports.File({
          filename: 'logs/error.log',
          level: 'error'
        }),
        new winston.transports.File({
          filename: 'logs/combined.log'
        })
      ]
    });

    if (process.env.NODE_ENV !== 'production') {
      this.logger.add(new winston.transports.Console({
        format: winston.format.simple()
      }));
    }
  }

  // Partner operation logging
  logPartnerOperation(operation, partner, data, duration, success = true) {
    const logData = {
      operation,
      partner,
      duration,
      success,
      timestamp: new Date(),
      ...data
    };
    if (success) {
      this.logger.info('Partner operation completed', logData);
    } else {
      this.logger.error('Partner operation failed', logData);
    }
  }

  // Assignment logging
  logAssignment(jobId, userId, partnerType, status, metadata = {}) {
    this.logger.info('Job assignment', {
      jobId,
      userId,
      partnerType,
      status,
      metadata,
      timestamp: new Date()
    });
  }

  // Sync operation logging
  logSync(assignmentId, operation, status, duration, metadata = {}) {
    const logData = {
      assignmentId,
      operation,
      status,
      duration,
      metadata,
      timestamp: new Date()
    };
    if (status === 'success') {
      this.logger.info('Sync operation completed', logData);
    } else {
      this.logger.error('Sync operation failed', logData);
    }
  }

  // Data processing logging
  logDataProcessing(applicationId, stage, status, metadata = {}) {
    this.logger.info('Data processing stage', {
      applicationId,
      stage,
      status,
      metadata,
      timestamp: new Date()
    });
  }

  // Error logging with context
  logError(error, context = {}) {
    this.logger.error('Application error', {
      message: error.message,
      stack: error.stack,
      context,
      timestamp: new Date()
    });
  }

  // Performance logging
  logPerformance(operation, duration, metadata = {}) {
    this.logger.info('Performance metric', {
      operation,
      duration,
      metadata,
      timestamp: new Date()
    });
  }
}

module.exports = new Logger();
```
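`logPerformance` pairs naturally with a small timing wrapper; an illustrative helper (`uploadJobToPartner` is a placeholder):
```javascript
const logger = require('./services/monitoring/Logger');

// Run any async operation and log how long it took, even on failure
async function withTiming(operation, fn) {
  const start = Date.now();
  try {
    return await fn();
  } finally {
    logger.logPerformance(operation, Date.now() - start);
  }
}

// Usage: await withTiming('satloc_job_upload', () => uploadJobToPartner(job));
```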
## Dashboard Configuration
### 1. Grafana Dashboard JSON
```json
{
  "dashboard": {
    "id": null,
    "title": "Partner Integration Monitoring",
    "tags": ["agmission", "partners"],
    "timezone": "UTC",
    "panels": [
      {
        "id": 1,
        "title": "Partner API Response Times",
        "type": "stat",
        "targets": [
          {
            "expr": "sum by (partner) (rate(partner_api_duration_seconds_sum[5m])) / sum by (partner) (rate(partner_api_duration_seconds_count[5m]))",
            "legendFormat": "{{partner}} avg"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "thresholds": {
              "steps": [
                { "color": "green", "value": null },
                { "color": "yellow", "value": 2 },
                { "color": "red", "value": 5 }
              ]
            }
          }
        }
      },
      {
        "id": 2,
        "title": "Queue Depth",
        "type": "graph",
        "targets": [
          {
            "expr": "queue_depth",
            "legendFormat": "{{queue_type}} - {{partner}}"
          }
        ]
      },
      {
        "id": 3,
        "title": "Assignment Success Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "rate(job_assignments_total{status=\"success\"}[5m]) / rate(job_assignments_total[5m]) * 100",
            "legendFormat": "Success Rate %"
          }
        ]
      },
      {
        "id": 4,
        "title": "Error Rates by Partner",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(partner_api_requests_total{status!=\"success\"}[5m])",
            "legendFormat": "{{partner}} errors/sec"
          }
        ]
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "30s"
  }
}
```
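The panel queries assume the application exports Prometheus metrics under these names. A minimal sketch of the instrumentation using the `prom-client` package (the package choice and the bucket boundaries are assumptions; the metric names match the dashboard):
```javascript
// File: services/monitoring/metrics.js (sketch; assumes the prom-client package)
const client = require('prom-client');

const partnerApiDuration = new client.Histogram({
  name: 'partner_api_duration_seconds',
  help: 'Duration of partner API calls in seconds',
  labelNames: ['partner'],
  buckets: [0.1, 0.5, 1, 2, 5, 10] // assumed boundaries
});

const partnerApiRequests = new client.Counter({
  name: 'partner_api_requests_total',
  help: 'Total partner API requests by outcome',
  labelNames: ['partner', 'status']
});

const queueDepth = new client.Gauge({
  name: 'queue_depth',
  help: 'Pending jobs per queue',
  labelNames: ['queue_type', 'partner']
});

const jobAssignments = new client.Counter({
  name: 'job_assignments_total',
  help: 'Total job assignments by outcome',
  labelNames: ['partner', 'status']
});

module.exports = { partnerApiDuration, partnerApiRequests, queueDepth, jobAssignments };
```
Expose them on a `/metrics` endpoint via `client.register.metrics()` so Prometheus can scrape them.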
### 2. Alert Rules Configuration
```yaml
# File: monitoring/alerts.yml
groups:
  - name: partner_integration
    rules:
      - alert: PartnerAPIHighErrorRate
        expr: rate(partner_api_requests_total{status!="success"}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "High error rate for partner {{ $labels.partner }}"
          description: "Partner {{ $labels.partner }} has error rate of {{ $value }} errors/sec"

      - alert: PartnerAPIDown
        expr: up{job="partner_api"} == 0
        for: 1m
        labels:
          severity: critical
          team: integration
        annotations:
          summary: "Partner API {{ $labels.partner }} is down"
          description: "Partner {{ $labels.partner }} API has been unreachable for over 1 minute"

      - alert: QueueDepthHigh
        expr: queue_depth > 1000
        for: 5m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "High queue depth for {{ $labels.queue_type }}"
          description: "Queue {{ $labels.queue_type }} has {{ $value }} pending jobs"

      - alert: SyncFailureRateHigh
        expr: rate(sync_duration_seconds_count{status="failed"}[10m]) > 0.05
        for: 5m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "High sync failure rate for {{ $labels.partner }}"
          description: "Partner {{ $labels.partner }} sync failure rate is {{ $value }}"

      - alert: DataProcessingStuck
        expr: increase(data_processing_duration_seconds_count[1h]) == 0
        for: 30m
        labels:
          severity: critical
          team: integration
        annotations:
          summary: "Data processing appears stuck"
          description: "No data processing completions in the last hour"
```
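These rules only take effect once Prometheus loads the file; a minimal `prometheus.yml` fragment (file locations and the Alertmanager target are illustrative):
```yaml
# prometheus.yml (fragment)
rule_files:
  - "monitoring/alerts.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]
```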
## Troubleshooting Playbook
### 1. Partner API Issues
#### Symptoms
- High error rates in partner API calls
- Timeouts or connection failures
- Authentication errors
#### Investigation Steps
1. Check partner API status dashboard
2. Verify network connectivity to partner
3. Validate API credentials and tokens
4. Review partner API rate limits
5. Check partner service logs
#### Resolution Actions
```bash
# Check partner connectivity
curl -H "Authorization: Bearer $TOKEN" https://api.partner.com/health
# Review recent errors
kubectl logs -f deployment/agmission-api | grep "partner.*error"
# Restart partner service if needed
kubectl rollout restart deployment/agmission-api
```
### 2. Queue Processing Issues
#### Symptoms
- Increasing queue depth
- Jobs stuck in pending status
- Worker processes crashing
#### Investigation Steps
1. Check queue worker health and logs
2. Verify Redis connectivity
3. Monitor memory and CPU usage
4. Check for deadlocks in job processing
#### Resolution Actions
```bash
# Check queue status
redis-cli -h redis-host info keyspace
# Restart queue workers
pm2 restart partner-sync-worker
# Clear stuck jobs (DANGER: flushdb deletes every key in the selected Redis database)
redis-cli -h redis-host flushdb
```
### 3. Data Sync Issues
#### Symptoms
- Jobs not syncing to partners
- Data not being retrieved from partners
- Sync operations timing out
#### Investigation Steps
1. Check assignment sync states in database
2. Verify partner job creation
3. Review sync queue processing
4. Check for configuration issues
#### Resolution Actions
```javascript
// Manual sync trigger via API:
//   POST /api/sync/assignments/{assignmentId}/sync
//   body: { "operation": "job_upload", "force": true }

// Database query (mongo shell) to check sync status
db.job_assigns.find({
  "syncState.jobUpload.status": "failed",
  "partnerType": "satloc"
}).limit(10);
```
### 4. Performance Issues
#### Symptoms
- Slow response times
- High memory usage
- Database query timeouts
#### Investigation Steps
1. Monitor application performance metrics
2. Check database query performance
3. Review memory and CPU utilization
4. Analyze slow API endpoints
#### Resolution Actions
```bash
# Scale up application pods
kubectl scale deployment agmission-api --replicas=3

# Check database performance (run in the mongo shell, not bash):
#   db.currentOp({ "secs_running": { $gte: 5 } })

# Monitor memory usage
kubectl top pods -l app=agmission-api
```
## Incident Response Procedures
### 1. Incident Classification
| Severity | Response Time | Example Issues |
|----------|---------------|----------------|
| P0 - Critical | 15 minutes | Complete system outage, data loss |
| P1 - High | 1 hour | Partner integration down, major feature broken |
| P2 - Medium | 4 hours | Performance degradation, minor feature issues |
| P3 - Low | 24 hours | Cosmetic issues, non-critical bugs |
### 2. Escalation Matrix
```
Level 1: On-call Engineer
├── Initial response and investigation
├── Follow standard playbooks
└── Escalate to Level 2 if needed

Level 2: Senior Engineer + Team Lead
├── Complex troubleshooting
├── Architecture decisions
└── Escalate to Level 3 if needed

Level 3: Architect + Management
├── System design issues
├── Business impact decisions
└── External vendor coordination
```
### 3. Communication Templates
#### Initial Alert
```
🚨 INCIDENT: Partner Integration Issue
Severity: P1
Impact: Satloc jobs not syncing
Started: 2025-07-18 10:30 UTC
Assigned: @engineer-oncall
Status: Investigating
Next Update: 10:45 UTC
```
#### Status Update
```
📊 UPDATE: Partner Integration Issue
Severity: P1
Root Cause: Partner API rate limiting
Action: Implementing exponential backoff
ETA: 11:00 UTC for resolution
Next Update: 11:00 UTC
```
#### Resolution Notice
```
✅ RESOLVED: Partner Integration Issue
Duration: 30 minutes
Root Cause: Partner API rate limiting
Fix: Updated retry logic with exponential backoff
Prevention: Added rate limit monitoring
Post-mortem: Scheduled for 2025-07-19 14:00 UTC
```
This monitoring and observability strategy provides comprehensive visibility into the partner integration system, enabling proactive issue detection and rapid incident response.