24 KiB
Simplified Partner System Monitoring Guide
Overview
This guide provides a lightweight monitoring approach for the partner integration system, focusing on essential health checks and logging without complex infrastructure requirements.
Basic Monitoring Strategy
1. Essential Health Checks
Simple Health Check Implementation
// File: services/monitoring/SimpleHealthChecker.js
class SimpleHealthChecker {
  /**
   * Lightweight health checker for the partner integration system.
   * Aggregates database, partner-user, and process-level checks into a
   * single healthy / degraded / unhealthy report.
   */
  constructor() {
    // Project-local partner configuration helper.
    this.partnerConfig = require('../helpers/partner_config');
  }

  /**
   * Run every basic check and compute the overall status.
   * Never throws: any unexpected failure marks the report unhealthy.
   * @returns {Promise<{timestamp: Date, overall: string, components: object, error?: string}>}
   */
  async performBasicHealthCheck() {
    const results = {
      timestamp: new Date(),
      overall: 'healthy',
      components: {}
    };
    try {
      // Check database connectivity
      results.components.database = await this.checkDatabase();
      // Check partner system users
      results.components.partnerUsers = await this.checkPartnerUsers();
      // Check application health (synchronous — process-level stats only)
      results.components.application = this.checkApplication();
      // Determine overall health
      results.overall = this.calculateOverallHealth(results.components);
    } catch (error) {
      results.overall = 'unhealthy';
      results.error = error.message;
    }
    return results;
  }

  // Ping MongoDB via the driver's admin interface and record latency.
  async checkDatabase() {
    try {
      const start = Date.now();
      await require('mongoose').connection.db.admin().ping();
      return {
        status: 'healthy',
        responseTime: Date.now() - start
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  /**
   * Inspect partner system users: reported degraded once errored users
   * reach half of the active-user count.
   */
  async checkPartnerUsers() {
    try {
      const { PartnerSystemUser } = require('../model/partner');
      const activeUsers = await PartnerSystemUser.countDocuments({ active: true });
      const errorUsers = await PartnerSystemUser.countDocuments({ syncStatus: 'error' });
      return {
        status: errorUsers < activeUsers * 0.5 ? 'healthy' : 'degraded',
        details: {
          activeUsers,
          errorUsers,
          // FIX: always a number — previously `.toFixed(2)` returned a
          // string while the zero-active fallback returned the number 0,
          // so the field's type depended on the data.
          errorRate: activeUsers > 0
            ? Number(((errorUsers / activeUsers) * 100).toFixed(2))
            : 0
        }
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  // Process-level vitals: heap usage above 1 GiB marks the app degraded.
  checkApplication() {
    const memoryUsage = process.memoryUsage();
    const uptime = process.uptime();
    const memoryThreshold = 1024 * 1024 * 1024; // 1GB
    return {
      status: memoryUsage.heapUsed < memoryThreshold ? 'healthy' : 'degraded',
      details: {
        memoryUsedMB: Math.round(memoryUsage.heapUsed / 1024 / 1024),
        uptimeHours: Math.round(uptime / 3600 * 100) / 100
      }
    };
  }

  // Worst status wins: unhealthy > degraded > healthy.
  calculateOverallHealth(components) {
    const statuses = Object.values(components).map((c) => c.status);
    if (statuses.includes('unhealthy')) {
      return 'unhealthy';
    } else if (statuses.includes('degraded')) {
      return 'degraded';
    } else {
      return 'healthy';
    }
  }

  // Express handler factory: 200 for healthy/degraded, 503 otherwise, so
  // load balancers only evict truly unhealthy nodes.
  healthCheckMiddleware() {
    return async (req, res) => {
      const health = await this.performBasicHealthCheck();
      const statusCode = health.overall === 'healthy' ? 200 :
        health.overall === 'degraded' ? 200 : 503;
      res.status(statusCode).json(health);
    };
  }
}
module.exports = new SimpleHealthChecker();
2. Basic Logging Strategy
Simple Logger Implementation
// File: services/monitoring/SimpleLogger.js
const winston = require('winston');
class SimpleLogger {
  /**
   * Winston-backed logger for partner-integration activity.
   * Errors go to logs/partner-errors.log, all levels to
   * logs/partner-activity.log; console output is added outside production.
   */
  constructor() {
    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        // Capture stack traces when Error objects are logged — keeps the
        // output consistent with services/monitoring/Logger.js.
        winston.format.errors({ stack: true }),
        winston.format.json()
      ),
      transports: [
        new winston.transports.File({
          filename: 'logs/partner-errors.log',
          level: 'error'
        }),
        new winston.transports.File({
          filename: 'logs/partner-activity.log'
        })
      ]
    });
    if (process.env.NODE_ENV !== 'production') {
      this.logger.add(new winston.transports.Console({
        format: winston.format.simple()
      }));
    }
  }

  /**
   * Log the outcome of a partner operation.
   * @param {string} operation - Operation name, e.g. 'sync'.
   * @param {string} partner - Partner identifier.
   * @param {boolean} success - Whether the operation succeeded.
   * @param {object} [metadata] - Extra fields merged into the log entry.
   */
  logPartnerOperation(operation, partner, success, metadata = {}) {
    const logData = {
      operation,
      partner,
      success,
      timestamp: new Date(),
      ...metadata
    };
    if (success) {
      this.logger.info('Partner operation completed', logData);
    } else {
      this.logger.error('Partner operation failed', logData);
    }
  }

  /**
   * Log a critical error with its stack trace and optional context.
   * @param {Error} error
   * @param {object} [context]
   */
  logError(error, context = {}) {
    this.logger.error('Partner system error', {
      message: error.message,
      stack: error.stack,
      context,
      timestamp: new Date()
    });
  }

  /**
   * Log a customer/partner sync activity at info level.
   */
  logSync(customerId, partnerId, operation, status, metadata = {}) {
    this.logger.info('Partner sync activity', {
      customerId,
      partnerId,
      operation,
      status,
      metadata,
      timestamp: new Date()
    });
  }
}
module.exports = new SimpleLogger();
Key Metrics to Monitor
1. Partner System User Health
- Active partner system users count
- Error rate per partner
- Last successful sync per customer
2. API Call Success Rates
- Successful vs failed API calls per partner
- Response times (basic timing)
- Authentication failures
3. Application Health
- Memory usage
- Uptime
- Database connectivity
Simple Alerting
Email Alerts for Critical Issues
// File: services/monitoring/SimpleAlerting.js
const nodemailer = require('nodemailer');
class SimpleAlerting {
  /**
   * Email-based alerting for critical partner-integration issues.
   * SMTP settings and recipients come from environment variables.
   */
  constructor() {
    // BUG FIX: nodemailer exposes createTransport, not createTransporter —
    // the old call threw "createTransporter is not a function" at startup.
    this.transporter = nodemailer.createTransport({
      host: process.env.SMTP_HOST,
      port: process.env.SMTP_PORT,
      secure: false,
      auth: {
        user: process.env.SMTP_USER,
        pass: process.env.SMTP_PASS
      }
    });
    this.alertEmails = SimpleAlerting.parseAlertEmails(process.env.ALERT_EMAILS);
  }

  /**
   * Parse a comma-separated recipient list, dropping blank entries.
   * BUG FIX: ''.split(',') yields [''], so the "no recipients" guard in
   * sendAlert never fired when ALERT_EMAILS was unset.
   * @param {string|undefined} raw
   * @returns {string[]}
   */
  static parseAlertEmails(raw) {
    return (raw || '')
      .split(',')
      .map((addr) => addr.trim())
      .filter(Boolean);
  }

  /**
   * Escape HTML metacharacters so error text cannot inject markup into
   * the alert email body.
   * @param {string} text
   * @returns {string}
   */
  static escapeHtml(text) {
    return String(text)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  }

  /**
   * Send an alert email to all configured recipients.
   * @param {string} subject
   * @param {string} message - Plain-text body.
   * @param {string} [severity] - e.g. 'warning' | 'critical'.
   */
  async sendAlert(subject, message, severity = 'warning') {
    if (!this.alertEmails.length) return; // no recipients configured
    const emailContent = {
      from: process.env.ALERT_FROM_EMAIL,
      to: this.alertEmails.join(','),
      subject: `[${severity.toUpperCase()}] ${subject}`,
      text: message,
      html: `<pre>${SimpleAlerting.escapeHtml(message)}</pre>`
    };
    try {
      await this.transporter.sendMail(emailContent);
    } catch (error) {
      // Alerting must never crash the caller; log and continue.
      console.error('Failed to send alert email:', error);
    }
  }

  // Alert when a customer's partner sync has failed 5+ times in a row.
  async alertPartnerSyncFailure(customerId, partnerId, errorCount) {
    if (errorCount >= 5) {
      await this.sendAlert(
        'Partner Sync Failure',
        `Customer ${customerId} has failed to sync with partner ${partnerId} ${errorCount} times consecutively.`,
        'critical'
      );
    }
  }

  // Alert when a partner API is unreachable.
  async alertPartnerDown(partnerCode, error) {
    await this.sendAlert(
      'Partner API Down',
      `Partner ${partnerCode} API is not responding: ${error}`,
      'critical'
    );
  }
}
module.exports = new SimpleAlerting();
Dashboard Implementation
Simple HTML Dashboard
<!-- File: public/partner-dashboard.html -->
<!DOCTYPE html>
<html>
<head>
<title>Partner Integration Dashboard</title>
<style>
.status-healthy { color: green; }
.status-degraded { color: orange; }
.status-unhealthy { color: red; }
.card { border: 1px solid #ddd; margin: 10px; padding: 15px; }
</style>
</head>
<body>
<h1>Partner Integration Dashboard</h1>
<div id="health-status" class="card">
<h2>System Health</h2>
<div id="health-content">Loading...</div>
</div>
<div id="partner-stats" class="card">
<h2>Partner Statistics</h2>
<div id="stats-content">Loading...</div>
</div>
<script>
// Fetch /api/health and render the overall and per-component statuses
// into the health card; falls back to an error message on any failure.
async function loadHealthStatus() {
  const healthBox = document.getElementById('health-content');
  try {
    const res = await fetch('/api/health');
    const health = await res.json();
    const { database, partnerUsers, application } = health.components;
    healthBox.innerHTML = `
<p>Overall Status: <span class="status-${health.overall}">${health.overall}</span></p>
<p>Database: <span class="status-${database.status}">${database.status}</span></p>
<p>Partner Users: <span class="status-${partnerUsers.status}">${partnerUsers.status}</span></p>
<p>Application: <span class="status-${application.status}">${application.status}</span></p>
`;
  } catch (error) {
    healthBox.innerHTML = 'Error loading health status';
  }
}
// Fetch aggregate partner statistics and render them into the stats card;
// shows a plain error message if the endpoint is unreachable.
async function loadPartnerStats() {
  const statsBox = document.getElementById('stats-content');
  try {
    const res = await fetch('/api/partners/stats');
    const stats = await res.json();
    statsBox.innerHTML = `
<p>Total Partners: ${stats.totalPartners}</p>
<p>Active Partner Users: ${stats.activePartnerUsers}</p>
<p>Recent Sync Errors: ${stats.recentErrors}</p>
`;
  } catch (error) {
    statsBox.innerHTML = 'Error loading partner stats';
  }
}
// Refresh both dashboard panels every 30 seconds.
setInterval(() => {
loadHealthStatus();
loadPartnerStats();
}, 30000);
// Populate the panels immediately on page load.
loadHealthStatus();
loadPartnerStats();
</script>
</body>
</html>
This simplified monitoring approach provides essential visibility without requiring complex infrastructure like Grafana, Prometheus, or extensive logging systems.

Full HealthChecker Implementation (advanced variant)

// File: services/monitoring/HealthChecker.js
async performHealthCheck() {
const startTime = Date.now();
const results = {
timestamp: new Date(),
overall: 'healthy',
components: {},
duration: 0
};
try {
// Check database connectivity
results.components.database = await this.checkDatabase();
// Check queue system
results.components.queues = await this.checkQueues();
// Check partner connectivity
results.components.partners = await this.checkPartners();
// Check application health
results.components.application = await this.checkApplication();
// Determine overall health
results.overall = this.calculateOverallHealth(results.components);
} catch (error) {
results.overall = 'unhealthy';
results.error = error.message;
}
results.duration = Date.now() - startTime;
this.healthStatus = results;
return results;
}
async checkDatabase() { try { const start = Date.now(); await this.database.db.admin().ping();
return {
status: 'healthy',
responseTime: Date.now() - start,
details: {
connected: true,
readyState: this.database.connection.readyState
}
};
} catch (error) {
return {
status: 'unhealthy',
error: error.message
};
}
}
async checkQueues() { try { const stats = await this.queueSystem.getQueueStats(); const totalPending = Object.values(stats).reduce((sum, queue) => sum + queue.waiting + queue.active, 0);
const isHealthy = totalPending < 1000; // Threshold for healthy queue
return {
status: isHealthy ? 'healthy' : 'degraded',
details: {
totalPending,
stats
}
};
} catch (error) {
return {
status: 'unhealthy',
error: error.message
};
}
}
async checkPartners() { const partnerResults = {}; const partners = this.partnerRegistry.getAll();
for (const partner of partners) {
try {
const result = await partner.healthCheck();
partnerResults[partner.getPartnerType()] = result;
} catch (error) {
partnerResults[partner.getPartnerType()] = {
status: 'unhealthy',
error: error.message
};
}
}
const healthyPartners = Object.values(partnerResults)
.filter(p => p.status === 'healthy').length;
const totalPartners = Object.keys(partnerResults).length;
return {
status: healthyPartners > 0 ? 'healthy' : 'unhealthy',
healthyCount: healthyPartners,
totalCount: totalPartners,
details: partnerResults
};
}
async checkApplication() { try { const memoryUsage = process.memoryUsage(); const cpuUsage = process.cpuUsage(); const uptime = process.uptime();
const memoryThreshold = 1024 * 1024 * 1024; // 1GB
const isMemoryHealthy = memoryUsage.heapUsed < memoryThreshold;
return {
status: isMemoryHealthy ? 'healthy' : 'degraded',
details: {
memory: memoryUsage,
cpu: cpuUsage,
uptime,
nodeVersion: process.version
}
};
} catch (error) {
return {
status: 'unhealthy',
error: error.message
};
}
}
calculateOverallHealth(components) { const statuses = Object.values(components).map(c => c.status);
if (statuses.includes('unhealthy')) {
return 'unhealthy';
} else if (statuses.includes('degraded')) {
return 'degraded';
} else {
return 'healthy';
}
}
// Return the most recently cached health-check report (set by performHealthCheck).
getHealthStatus() { return this.healthStatus; }
// Express middleware for health check endpoint healthCheckMiddleware() { return async (req, res) => { const health = await this.performHealthCheck(); const statusCode = health.overall === 'healthy' ? 200 : health.overall === 'degraded' ? 200 : 503;
res.status(statusCode).json(health);
};
} }
module.exports = HealthChecker;
### 3. Structured Logging
```javascript
// File: services/monitoring/Logger.js
const winston = require('winston');
/**
 * Structured application logger built on winston.
 * Errors are written to logs/error.log, every level to logs/combined.log,
 * and output is mirrored to the console outside production.
 */
class Logger {
  constructor() {
    const fileTransports = [
      new winston.transports.File({ filename: 'logs/error.log', level: 'error' }),
      new winston.transports.File({ filename: 'logs/combined.log' })
    ];
    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.errors({ stack: true }),
        winston.format.json()
      ),
      defaultMeta: {
        service: 'agmission-partner-integration',
        version: process.env.APP_VERSION || '1.0.0'
      },
      transports: fileTransports
    });
    if (process.env.NODE_ENV !== 'production') {
      this.logger.add(
        new winston.transports.Console({ format: winston.format.simple() })
      );
    }
  }

  /**
   * Record a partner operation: info level on success, error level on
   * failure. Extra fields in `data` are merged into the entry.
   */
  logPartnerOperation(operation, partner, data, duration, success = true) {
    const entry = {
      operation,
      partner,
      duration,
      success,
      timestamp: new Date(),
      ...data
    };
    const level = success ? 'info' : 'error';
    const message = success
      ? 'Partner operation completed'
      : 'Partner operation failed';
    this.logger[level](message, entry);
  }

  // Record a job assignment event.
  logAssignment(jobId, userId, partnerType, status, metadata = {}) {
    const entry = { jobId, userId, partnerType, status, metadata, timestamp: new Date() };
    this.logger.info('Job assignment', entry);
  }

  // Record a sync operation; anything other than 'success' logs at error level.
  logSync(assignmentId, operation, status, duration, metadata = {}) {
    const entry = { assignmentId, operation, status, duration, metadata, timestamp: new Date() };
    if (status === 'success') {
      this.logger.info('Sync operation completed', entry);
    } else {
      this.logger.error('Sync operation failed', entry);
    }
  }

  // Record progress through a data-processing stage.
  logDataProcessing(applicationId, stage, status, metadata = {}) {
    const entry = { applicationId, stage, status, metadata, timestamp: new Date() };
    this.logger.info('Data processing stage', entry);
  }

  // Record an error with its stack trace and caller-supplied context.
  logError(error, context = {}) {
    this.logger.error('Application error', {
      message: error.message,
      stack: error.stack,
      context,
      timestamp: new Date()
    });
  }

  // Record a timing/performance measurement.
  logPerformance(operation, duration, metadata = {}) {
    const entry = { operation, duration, metadata, timestamp: new Date() };
    this.logger.info('Performance metric', entry);
  }
}
module.exports = new Logger();
```

Dashboard Configuration
1. Grafana Dashboard JSON
{
"dashboard": {
"id": null,
"title": "Partner Integration Monitoring",
"tags": ["agmission", "partners"],
"timezone": "UTC",
"panels": [
{
"id": 1,
"title": "Partner API Response Times",
"type": "stat",
"targets": [
{
"expr": "avg(partner_api_duration_seconds) by (partner)",
"legendFormat": "{{partner}} avg"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 2},
{"color": "red", "value": 5}
]
}
}
}
},
{
"id": 2,
"title": "Queue Depth",
"type": "graph",
"targets": [
{
"expr": "queue_depth",
"legendFormat": "{{queue_type}} - {{partner}}"
}
]
},
{
"id": 3,
"title": "Assignment Success Rate",
"type": "stat",
"targets": [
{
"expr": "rate(job_assignments_total{status=\"success\"}[5m]) / rate(job_assignments_total[5m]) * 100",
"legendFormat": "Success Rate %"
}
]
},
{
"id": 4,
"title": "Error Rates by Partner",
"type": "graph",
"targets": [
{
"expr": "rate(partner_api_requests_total{status!=\"success\"}[5m])",
"legendFormat": "{{partner}} errors/sec"
}
]
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "30s"
}
}
2. Alert Rules Configuration
# File: monitoring/alerts.yml
groups:
- name: partner_integration
rules:
- alert: PartnerAPIHighErrorRate
expr: rate(partner_api_requests_total{status!="success"}[5m]) > 0.1
for: 2m
labels:
severity: warning
team: integration
annotations:
summary: "High error rate for partner {{ $labels.partner }}"
description: "Partner {{ $labels.partner }} has error rate of {{ $value }} errors/sec"
- alert: PartnerAPIDown
expr: up{job="partner_api"} == 0
for: 1m
labels:
severity: critical
team: integration
annotations:
summary: "Partner API {{ $labels.partner }} is down"
description: "Partner {{ $labels.partner }} API has been unreachable for over 1 minute"
- alert: QueueDepthHigh
expr: queue_depth > 1000
for: 5m
labels:
severity: warning
team: integration
annotations:
summary: "High queue depth for {{ $labels.queue_type }}"
description: "Queue {{ $labels.queue_type }} has {{ $value }} pending jobs"
- alert: SyncFailureRateHigh
expr: rate(sync_duration_seconds{status="failed"}[10m]) > 0.05
for: 5m
labels:
severity: warning
team: integration
annotations:
summary: "High sync failure rate for {{ $labels.partner }}"
description: "Partner {{ $labels.partner }} sync failure rate is {{ $value }}"
- alert: DataProcessingStuck
expr: increase(data_processing_duration_seconds_count[1h]) == 0
for: 30m
labels:
severity: critical
team: integration
annotations:
summary: "Data processing appears stuck"
description: "No data processing completions in the last hour"
Troubleshooting Playbook
1. Partner API Issues
Symptoms
- High error rates in partner API calls
- Timeouts or connection failures
- Authentication errors
Investigation Steps
- Check partner API status dashboard
- Verify network connectivity to partner
- Validate API credentials and tokens
- Review partner API rate limits
- Check partner service logs
Resolution Actions
# Check partner connectivity
curl -H "Authorization: Bearer $TOKEN" https://api.partner.com/health
# Review recent errors
kubectl logs -f deployment/agmission-api | grep "partner.*error"
# Restart partner service if needed
kubectl rollout restart deployment/agmission-api
2. Queue Processing Issues
Symptoms
- Increasing queue depth
- Jobs stuck in pending status
- Worker processes crashing
Investigation Steps
- Check queue worker health and logs
- Verify Redis connectivity
- Monitor memory and CPU usage
- Check for deadlocks in job processing
Resolution Actions
# Check queue status
redis-cli -h redis-host info keyspace
# Restart queue workers
pm2 restart partner-sync-worker
# Clear stuck jobs (use with caution)
redis-cli -h redis-host flushdb
3. Data Sync Issues
Symptoms
- Jobs not syncing to partners
- Data not being retrieved from partners
- Sync operations timing out
Investigation Steps
- Check assignment sync states in database
- Verify partner job creation
- Review sync queue processing
- Check for configuration issues
Resolution Actions
// Manual sync trigger via API
POST /api/sync/assignments/{assignmentId}/sync
{
"operation": "job_upload",
"force": true
}
// Database query to check sync status
db.job_assigns.find({
"syncState.jobUpload.status": "failed",
"partnerType": "satloc"
}).limit(10);
4. Performance Issues
Symptoms
- Slow response times
- High memory usage
- Database query timeouts
Investigation Steps
- Monitor application performance metrics
- Check database query performance
- Review memory and CPU utilization
- Analyze slow API endpoints
Resolution Actions
# Scale up application pods
kubectl scale deployment agmission-api --replicas=3
# Check database performance
db.runCommand({currentOp: 1, "secs_running": {$gte: 5}})
# Monitor memory usage
kubectl top pods -l app=agmission-api
Incident Response Procedures
1. Incident Classification
| Severity | Response Time | Example Issues |
|---|---|---|
| P0 - Critical | 15 minutes | Complete system outage, data loss |
| P1 - High | 1 hour | Partner integration down, major feature broken |
| P2 - Medium | 4 hours | Performance degradation, minor feature issues |
| P3 - Low | 24 hours | Cosmetic issues, non-critical bugs |
2. Escalation Matrix
Level 1: On-call Engineer
├── Initial response and investigation
├── Follow standard playbooks
└── Escalate to Level 2 if needed
Level 2: Senior Engineer + Team Lead
├── Complex troubleshooting
├── Architecture decisions
└── Escalate to Level 3 if needed
Level 3: Architect + Management
├── System design issues
├── Business impact decisions
└── External vendor coordination
3. Communication Templates
Initial Alert
🚨 INCIDENT: Partner Integration Issue
Severity: P1
Impact: Satloc jobs not syncing
Started: 2025-07-18 10:30 UTC
Assigned: @engineer-oncall
Status: Investigating
Next Update: 10:45 UTC
Status Update
📊 UPDATE: Partner Integration Issue
Severity: P1
Root Cause: Partner API rate limiting
Action: Implementing exponential backoff
ETA: 11:00 UTC for resolution
Next Update: 11:00 UTC
Resolution Notice
✅ RESOLVED: Partner Integration Issue
Duration: 30 minutes
Root Cause: Partner API rate limiting
Fix: Updated retry logic with exponential backoff
Prevention: Added rate limit monitoring
Post-mortem: Scheduled for 2025-07-19 14:00 UTC
This monitoring and observability strategy provides comprehensive visibility into the partner integration system, enabling proactive issue detection and rapid incident response.