# Simplified Partner System Monitoring Guide

## Overview

This guide provides a lightweight monitoring approach for the partner integration system, focusing on essential health checks and logging without complex infrastructure requirements.

## Basic Monitoring Strategy

### 1. Essential Health Checks

#### Simple Health Check Implementation

```javascript
// File: services/monitoring/SimpleHealthChecker.js
class SimpleHealthChecker {
  constructor() {
    this.partnerConfig = require('../helpers/partner_config');
  }

  async performBasicHealthCheck() {
    const results = {
      timestamp: new Date(),
      overall: 'healthy',
      components: {}
    };

    try {
      // Check database connectivity
      results.components.database = await this.checkDatabase();

      // Check partner system users
      results.components.partnerUsers = await this.checkPartnerUsers();

      // Check application health
      results.components.application = this.checkApplication();

      // Determine overall health
      results.overall = this.calculateOverallHealth(results.components);
    } catch (error) {
      results.overall = 'unhealthy';
      results.error = error.message;
    }

    return results;
  }

  async checkDatabase() {
    try {
      const start = Date.now();
      await require('mongoose').connection.db.admin().ping();

      return {
        status: 'healthy',
        responseTime: Date.now() - start
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  async checkPartnerUsers() {
    try {
      const { PartnerSystemUser } = require('../model/partner');
      const activeUsers = await PartnerSystemUser.countDocuments({ active: true });
      const errorUsers = await PartnerSystemUser.countDocuments({ syncStatus: 'error' });

      return {
        // Degraded once half or more of the active users are in an error state;
        // zero active users counts as healthy rather than degraded.
        status: activeUsers === 0 || errorUsers < activeUsers * 0.5 ? 'healthy' : 'degraded',
        details: {
          activeUsers,
          errorUsers,
          errorRate: activeUsers > 0 ? Number((errorUsers / activeUsers * 100).toFixed(2)) : 0
        }
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  checkApplication() {
    const memoryUsage = process.memoryUsage();
    const uptime = process.uptime();
    const memoryThreshold = 1024 * 1024 * 1024; // 1GB

    return {
      status: memoryUsage.heapUsed < memoryThreshold ? 'healthy' : 'degraded',
      details: {
        memoryUsedMB: Math.round(memoryUsage.heapUsed / 1024 / 1024),
        uptimeHours: Math.round(uptime / 3600 * 100) / 100
      }
    };
  }

  calculateOverallHealth(components) {
    const statuses = Object.values(components).map(c => c.status);

    if (statuses.includes('unhealthy')) {
      return 'unhealthy';
    } else if (statuses.includes('degraded')) {
      return 'degraded';
    } else {
      return 'healthy';
    }
  }

  // Express middleware for health check endpoint
  healthCheckMiddleware() {
    return async (req, res) => {
      const health = await this.performBasicHealthCheck();
      // Degraded still returns 200 so load balancers keep routing traffic
      const statusCode = health.overall === 'healthy' ? 200 :
        health.overall === 'degraded' ? 200 : 503;

      res.status(statusCode).json(health);
    };
  }
}

module.exports = new SimpleHealthChecker();
```
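
To expose this check, mount the middleware on whatever router serves the API. A minimal sketch, assuming an existing Express app and the `/api/health` path the dashboard below fetches:

```javascript
// Hypothetical wiring: expose the health check at GET /api/health
const express = require('express');
const healthChecker = require('./services/monitoring/SimpleHealthChecker');

const app = express();

// Returns 200 for healthy/degraded, 503 for unhealthy
app.get('/api/health', healthChecker.healthCheckMiddleware());

app.listen(3000);
```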

### 2. Basic Logging Strategy

#### Simple Logger Implementation

```javascript
// File: services/monitoring/SimpleLogger.js
const winston = require('winston');

class SimpleLogger {
  constructor() {
    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.json()
      ),
      // Note: the logs/ directory must already exist; winston's File
      // transport does not create missing directories.
      transports: [
        new winston.transports.File({
          filename: 'logs/partner-errors.log',
          level: 'error'
        }),
        new winston.transports.File({
          filename: 'logs/partner-activity.log'
        })
      ]
    });

    if (process.env.NODE_ENV !== 'production') {
      this.logger.add(new winston.transports.Console({
        format: winston.format.simple()
      }));
    }
  }

  // Log partner operations (success/failure)
  logPartnerOperation(operation, partner, success, metadata = {}) {
    const logData = {
      operation,
      partner,
      success,
      timestamp: new Date(),
      ...metadata
    };

    if (success) {
      this.logger.info('Partner operation completed', logData);
    } else {
      this.logger.error('Partner operation failed', logData);
    }
  }

  // Log critical errors
  logError(error, context = {}) {
    this.logger.error('Partner system error', {
      message: error.message,
      stack: error.stack,
      context,
      timestamp: new Date()
    });
  }

  // Log sync activities
  logSync(customerId, partnerId, operation, status, metadata = {}) {
    this.logger.info('Partner sync activity', {
      customerId,
      partnerId,
      operation,
      status,
      metadata,
      timestamp: new Date()
    });
  }
}

module.exports = new SimpleLogger();
```
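
With the singleton export, call sites stay one-liners. A hedged example of a sync path instrumented with all three helpers; `syncCustomer` is a hypothetical function standing in for the real sync logic:

```javascript
// Hypothetical call site combining the three logging helpers
const logger = require('./services/monitoring/SimpleLogger');

async function syncCustomerJobs(customerId, partnerId) {
  try {
    const synced = await syncCustomer(customerId, partnerId); // assumed helper
    logger.logSync(customerId, partnerId, 'job_upload', 'success', { synced });
    logger.logPartnerOperation('job_upload', partnerId, true, { customerId });
  } catch (error) {
    logger.logPartnerOperation('job_upload', partnerId, false, { customerId });
    logger.logError(error, { customerId, partnerId });
    throw error;
  }
}
```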

## Key Metrics to Monitor

### 1. Partner System User Health

- Active partner system users count
- Error rate per partner
- Last successful sync per customer
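
These per-partner figures can be computed in a single aggregation rather than one query per partner. A sketch, assuming the `PartnerSystemUser` fields used by the health checker plus hypothetical `partnerId` and `lastSyncAt` fields:

```javascript
// Per-partner user-health metrics in one aggregation pass
const { PartnerSystemUser } = require('./services/model/partner');

async function getPartnerUserMetrics() {
  return PartnerSystemUser.aggregate([
    { $match: { active: true } },
    { $group: {
      _id: '$partnerId', // assumed field naming
      activeUsers: { $sum: 1 },
      errorUsers: { $sum: { $cond: [{ $eq: ['$syncStatus', 'error'] }, 1, 0] } },
      lastSuccessfulSync: { $max: '$lastSyncAt' } // hypothetical field
    } },
    { $project: {
      activeUsers: 1,
      errorUsers: 1,
      lastSuccessfulSync: 1,
      // Safe: every group contains at least one active user
      errorRatePct: { $multiply: [{ $divide: ['$errorUsers', '$activeUsers'] }, 100] }
    } }
  ]);
}
```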

### 2. API Call Success Rates

- Successful vs. failed API calls per partner
- Response times (basic timing)
- Authentication failures

### 3. Application Health

- Memory usage
- Uptime
- Database connectivity

## Simple Alerting

### Email Alerts for Critical Issues

```javascript
// File: services/monitoring/SimpleAlerting.js
const nodemailer = require('nodemailer');

class SimpleAlerting {
  constructor() {
    // Note: the nodemailer API is createTransport, not createTransporter
    this.transporter = nodemailer.createTransport({
      host: process.env.SMTP_HOST,
      port: Number(process.env.SMTP_PORT),
      secure: false,
      auth: {
        user: process.env.SMTP_USER,
        pass: process.env.SMTP_PASS
      }
    });

    // Filter out empty strings so an unset ALERT_EMAILS disables alerting
    this.alertEmails = (process.env.ALERT_EMAILS || '').split(',').filter(Boolean);
  }

  async sendAlert(subject, message, severity = 'warning') {
    if (!this.alertEmails.length) return;

    const emailContent = {
      from: process.env.ALERT_FROM_EMAIL,
      to: this.alertEmails.join(','),
      subject: `[${severity.toUpperCase()}] ${subject}`,
      text: message,
      html: `<pre>${message}</pre>`
    };

    try {
      await this.transporter.sendMail(emailContent);
    } catch (error) {
      console.error('Failed to send alert email:', error);
    }
  }

  // Alert when partner sync fails repeatedly
  async alertPartnerSyncFailure(customerId, partnerId, errorCount) {
    if (errorCount >= 5) {
      await this.sendAlert(
        'Partner Sync Failure',
        `Customer ${customerId} has failed to sync with partner ${partnerId} ${errorCount} times consecutively.`,
        'critical'
      );
    }
  }

  // Alert when partner API is down
  async alertPartnerDown(partnerCode, error) {
    await this.sendAlert(
      'Partner API Down',
      `Partner ${partnerCode} API is not responding: ${error}`,
      'critical'
    );
  }
}

module.exports = new SimpleAlerting();
```
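
The alerting helper plugs into the same failure paths as the logger. A sketch, assuming the caller already tracks a per-customer consecutive-error count (for example on the `PartnerSystemUser` document):

```javascript
// Hypothetical failure path combining logging and alerting
const logger = require('./services/monitoring/SimpleLogger');
const alerting = require('./services/monitoring/SimpleAlerting');

async function handleSyncFailure(customerId, partnerId, error, consecutiveErrors) {
  logger.logError(error, { customerId, partnerId });
  // The helper only emails once the count reaches its internal threshold (5),
  // so calling it on every failure does not spam the alert list.
  await alerting.alertPartnerSyncFailure(customerId, partnerId, consecutiveErrors);
}
```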

## Dashboard Implementation

### Simple HTML Dashboard

```html
<!-- File: public/partner-dashboard.html -->
<!DOCTYPE html>
<html>
<head>
  <title>Partner Integration Dashboard</title>
  <style>
    .status-healthy { color: green; }
    .status-degraded { color: orange; }
    .status-unhealthy { color: red; }
    .card { border: 1px solid #ddd; margin: 10px; padding: 15px; }
  </style>
</head>
<body>
  <h1>Partner Integration Dashboard</h1>

  <div id="health-status" class="card">
    <h2>System Health</h2>
    <div id="health-content">Loading...</div>
  </div>

  <div id="partner-stats" class="card">
    <h2>Partner Statistics</h2>
    <div id="stats-content">Loading...</div>
  </div>

  <script>
    async function loadHealthStatus() {
      try {
        const response = await fetch('/api/health');
        const health = await response.json();

        document.getElementById('health-content').innerHTML = `
          <p>Overall Status: <span class="status-${health.overall}">${health.overall}</span></p>
          <p>Database: <span class="status-${health.components.database.status}">${health.components.database.status}</span></p>
          <p>Partner Users: <span class="status-${health.components.partnerUsers.status}">${health.components.partnerUsers.status}</span></p>
          <p>Application: <span class="status-${health.components.application.status}">${health.components.application.status}</span></p>
        `;
      } catch (error) {
        document.getElementById('health-content').innerHTML = 'Error loading health status';
      }
    }

    async function loadPartnerStats() {
      try {
        const response = await fetch('/api/partners/stats');
        const stats = await response.json();

        document.getElementById('stats-content').innerHTML = `
          <p>Total Partners: ${stats.totalPartners}</p>
          <p>Active Partner Users: ${stats.activePartnerUsers}</p>
          <p>Recent Sync Errors: ${stats.recentErrors}</p>
        `;
      } catch (error) {
        document.getElementById('stats-content').innerHTML = 'Error loading partner stats';
      }
    }

    // Auto-refresh every 30 seconds
    setInterval(() => {
      loadHealthStatus();
      loadPartnerStats();
    }, 30000);

    // Initial load
    loadHealthStatus();
    loadPartnerStats();
  </script>
</body>
</html>
```
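
The dashboard fetches `/api/partners/stats`, which is not defined elsewhere in this guide. A minimal sketch of a compatible endpoint, assuming the `PartnerSystemUser` model from the health checker, a `partnerId` field, and mongoose `updatedAt` timestamps:

```javascript
// Hypothetical endpoint backing the "Partner Statistics" card
const express = require('express');
const { PartnerSystemUser } = require('./services/model/partner');

const router = express.Router();

router.get('/api/partners/stats', async (req, res) => {
  try {
    const since = new Date(Date.now() - 24 * 60 * 60 * 1000); // last 24 hours
    const [partnerIds, activePartnerUsers, recentErrors] = await Promise.all([
      PartnerSystemUser.distinct('partnerId'), // assumed field
      PartnerSystemUser.countDocuments({ active: true }),
      PartnerSystemUser.countDocuments({ syncStatus: 'error', updatedAt: { $gte: since } })
    ]);

    res.json({ totalPartners: partnerIds.length, activePartnerUsers, recentErrors });
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});

module.exports = router;
```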

This simplified monitoring approach provides essential visibility without requiring complex infrastructure like Grafana, Prometheus, or extensive logging systems.

## Extended Monitoring (Optional)

For teams that outgrow the simplified setup, the rest of this guide documents a fuller stack: a dependency-injected health checker that also covers queues and partner APIs, structured logging, a Grafana dashboard, and Prometheus alert rules.

### 1. Comprehensive Health Checker

Unlike the simplified checker, this class receives its dependencies via constructor injection. The constructor shown below is an assumed wiring, inferred from how the methods use `this.database`, `this.queueSystem`, and `this.partnerRegistry`.

```javascript
// File: services/monitoring/HealthChecker.js
class HealthChecker {
  // Assumed constructor: `database` is the mongoose instance, `queueSystem`
  // exposes getQueueStats(), and `partnerRegistry` exposes getAll() returning
  // partner clients with healthCheck() and getPartnerType().
  constructor({ database, queueSystem, partnerRegistry }) {
    this.database = database;
    this.queueSystem = queueSystem;
    this.partnerRegistry = partnerRegistry;
    this.healthStatus = null;
  }

  async performHealthCheck() {
    const startTime = Date.now();
    const results = {
      timestamp: new Date(),
      overall: 'healthy',
      components: {},
      duration: 0
    };

    try {
      // Check database connectivity
      results.components.database = await this.checkDatabase();

      // Check queue system
      results.components.queues = await this.checkQueues();

      // Check partner connectivity
      results.components.partners = await this.checkPartners();

      // Check application health
      results.components.application = await this.checkApplication();

      // Determine overall health
      results.overall = this.calculateOverallHealth(results.components);
    } catch (error) {
      results.overall = 'unhealthy';
      results.error = error.message;
    }

    results.duration = Date.now() - startTime;
    this.healthStatus = results;

    return results;
  }

  async checkDatabase() {
    try {
      const start = Date.now();
      await this.database.connection.db.admin().ping();

      return {
        status: 'healthy',
        responseTime: Date.now() - start,
        details: {
          connected: true,
          readyState: this.database.connection.readyState
        }
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  async checkQueues() {
    try {
      const stats = await this.queueSystem.getQueueStats();
      const totalPending = Object.values(stats).reduce((sum, queue) =>
        sum + queue.waiting + queue.active, 0);

      const isHealthy = totalPending < 1000; // Threshold for a healthy queue

      return {
        status: isHealthy ? 'healthy' : 'degraded',
        details: {
          totalPending,
          stats
        }
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  async checkPartners() {
    const partnerResults = {};
    const partners = this.partnerRegistry.getAll();

    for (const partner of partners) {
      try {
        const result = await partner.healthCheck();
        partnerResults[partner.getPartnerType()] = result;
      } catch (error) {
        partnerResults[partner.getPartnerType()] = {
          status: 'unhealthy',
          error: error.message
        };
      }
    }

    const healthyPartners = Object.values(partnerResults)
      .filter(p => p.status === 'healthy').length;
    const totalPartners = Object.keys(partnerResults).length;

    return {
      status: healthyPartners > 0 ? 'healthy' : 'unhealthy',
      healthyCount: healthyPartners,
      totalCount: totalPartners,
      details: partnerResults
    };
  }

  async checkApplication() {
    try {
      const memoryUsage = process.memoryUsage();
      const cpuUsage = process.cpuUsage();
      const uptime = process.uptime();

      const memoryThreshold = 1024 * 1024 * 1024; // 1GB
      const isMemoryHealthy = memoryUsage.heapUsed < memoryThreshold;

      return {
        status: isMemoryHealthy ? 'healthy' : 'degraded',
        details: {
          memory: memoryUsage,
          cpu: cpuUsage,
          uptime,
          nodeVersion: process.version
        }
      };
    } catch (error) {
      return {
        status: 'unhealthy',
        error: error.message
      };
    }
  }

  calculateOverallHealth(components) {
    const statuses = Object.values(components).map(c => c.status);

    if (statuses.includes('unhealthy')) {
      return 'unhealthy';
    } else if (statuses.includes('degraded')) {
      return 'degraded';
    } else {
      return 'healthy';
    }
  }

  getHealthStatus() {
    return this.healthStatus;
  }

  // Express middleware for health check endpoint
  healthCheckMiddleware() {
    return async (req, res) => {
      const health = await this.performHealthCheck();
      const statusCode = health.overall === 'healthy' ? 200 :
        health.overall === 'degraded' ? 200 : 503;

      res.status(statusCode).json(health);
    };
  }
}

module.exports = HealthChecker;
```
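
Because this module exports the class rather than an instance, the caller supplies the dependencies. A sketch of the wiring, with `queueSystem` and `partnerRegistry` as hypothetical module paths:

```javascript
// Hypothetical composition root
const mongoose = require('mongoose');
const express = require('express');
const HealthChecker = require('./services/monitoring/HealthChecker');
const queueSystem = require('./services/queue');        // assumed: exposes getQueueStats()
const partnerRegistry = require('./services/partners'); // assumed: exposes getAll()

const healthChecker = new HealthChecker({
  database: mongoose,
  queueSystem,
  partnerRegistry
});

const app = express();
app.get('/api/health', healthChecker.healthCheckMiddleware());
app.listen(3000);
```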

### 2. Structured Logging

```javascript
// File: services/monitoring/Logger.js
const winston = require('winston');

class Logger {
  constructor() {
    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.errors({ stack: true }),
        winston.format.json()
      ),
      defaultMeta: {
        service: 'agmission-partner-integration',
        version: process.env.APP_VERSION || '1.0.0'
      },
      transports: [
        new winston.transports.File({
          filename: 'logs/error.log',
          level: 'error'
        }),
        new winston.transports.File({
          filename: 'logs/combined.log'
        })
      ]
    });

    if (process.env.NODE_ENV !== 'production') {
      this.logger.add(new winston.transports.Console({
        format: winston.format.simple()
      }));
    }
  }

  // Partner operation logging
  logPartnerOperation(operation, partner, data, duration, success = true) {
    const logData = {
      operation,
      partner,
      duration,
      success,
      timestamp: new Date(),
      ...data
    };

    if (success) {
      this.logger.info('Partner operation completed', logData);
    } else {
      this.logger.error('Partner operation failed', logData);
    }
  }

  // Assignment logging
  logAssignment(jobId, userId, partnerType, status, metadata = {}) {
    this.logger.info('Job assignment', {
      jobId,
      userId,
      partnerType,
      status,
      metadata,
      timestamp: new Date()
    });
  }

  // Sync operation logging
  logSync(assignmentId, operation, status, duration, metadata = {}) {
    const logData = {
      assignmentId,
      operation,
      status,
      duration,
      metadata,
      timestamp: new Date()
    };

    if (status === 'success') {
      this.logger.info('Sync operation completed', logData);
    } else {
      this.logger.error('Sync operation failed', logData);
    }
  }

  // Data processing logging
  logDataProcessing(applicationId, stage, status, metadata = {}) {
    this.logger.info('Data processing stage', {
      applicationId,
      stage,
      status,
      metadata,
      timestamp: new Date()
    });
  }

  // Error logging with context
  logError(error, context = {}) {
    this.logger.error('Application error', {
      message: error.message,
      stack: error.stack,
      context,
      timestamp: new Date()
    });
  }

  // Performance logging
  logPerformance(operation, duration, metadata = {}) {
    this.logger.info('Performance metric', {
      operation,
      duration,
      metadata,
      timestamp: new Date()
    });
  }
}

module.exports = new Logger();
```
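
`logPerformance` pairs naturally with a small timing wrapper so durations are measured in one place. A sketch:

```javascript
// Hypothetical helper: time any async operation and log its duration
const logger = require('./services/monitoring/Logger');

async function timed(operation, fn, metadata = {}) {
  const start = Date.now();
  try {
    return await fn();
  } finally {
    // Logged whether fn resolves or throws
    logger.logPerformance(operation, Date.now() - start, metadata);
  }
}

// Usage: await timed('satloc_job_upload', () => uploadJob(job), { partner: 'satloc' });
```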

## Dashboard Configuration

### 1. Grafana Dashboard JSON

```json
{
  "dashboard": {
    "id": null,
    "title": "Partner Integration Monitoring",
    "tags": ["agmission", "partners"],
    "timezone": "UTC",
    "panels": [
      {
        "id": 1,
        "title": "Partner API Response Times",
        "type": "stat",
        "targets": [
          {
            "expr": "avg(partner_api_duration_seconds) by (partner)",
            "legendFormat": "{{partner}} avg"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 2},
                {"color": "red", "value": 5}
              ]
            }
          }
        }
      },
      {
        "id": 2,
        "title": "Queue Depth",
        "type": "graph",
        "targets": [
          {
            "expr": "queue_depth",
            "legendFormat": "{{queue_type}} - {{partner}}"
          }
        ]
      },
      {
        "id": 3,
        "title": "Assignment Success Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "rate(job_assignments_total{status=\"success\"}[5m]) / rate(job_assignments_total[5m]) * 100",
            "legendFormat": "Success Rate %"
          }
        ]
      },
      {
        "id": 4,
        "title": "Error Rates by Partner",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(partner_api_requests_total{status!=\"success\"}[5m])",
            "legendFormat": "{{partner}} errors/sec"
          }
        ]
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "30s"
  }
}
```
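
The panel queries assume the application exports Prometheus metrics with these names and labels. A sketch of matching definitions using prom-client; the metric names mirror the dashboard expressions, not an existing module:

```javascript
// Hypothetical prom-client definitions matching the dashboard queries
const client = require('prom-client');

const partnerApiDuration = new client.Histogram({
  name: 'partner_api_duration_seconds',
  help: 'Partner API call duration in seconds',
  labelNames: ['partner', 'operation']
});

const partnerApiRequests = new client.Counter({
  name: 'partner_api_requests_total',
  help: 'Partner API calls by outcome',
  labelNames: ['partner', 'status']
});

const queueDepth = new client.Gauge({
  name: 'queue_depth',
  help: 'Pending jobs per queue',
  labelNames: ['queue_type', 'partner']
});

const jobAssignments = new client.Counter({
  name: 'job_assignments_total',
  help: 'Job assignments by outcome',
  labelNames: ['partner', 'status']
});

module.exports = { client, partnerApiDuration, partnerApiRequests, queueDepth, jobAssignments };
```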

### 2. Alert Rules Configuration

```yaml
# File: monitoring/alerts.yml
groups:
  - name: partner_integration
    rules:
      - alert: PartnerAPIHighErrorRate
        expr: rate(partner_api_requests_total{status!="success"}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "High error rate for partner {{ $labels.partner }}"
          description: "Partner {{ $labels.partner }} has error rate of {{ $value }} errors/sec"

      - alert: PartnerAPIDown
        expr: up{job="partner_api"} == 0
        for: 1m
        labels:
          severity: critical
          team: integration
        annotations:
          summary: "Partner API {{ $labels.partner }} is down"
          description: "Partner {{ $labels.partner }} API has been unreachable for over 1 minute"

      - alert: QueueDepthHigh
        expr: queue_depth > 1000
        for: 5m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "High queue depth for {{ $labels.queue_type }}"
          description: "Queue {{ $labels.queue_type }} has {{ $value }} pending jobs"

      - alert: SyncFailureRateHigh
        expr: rate(sync_duration_seconds_count{status="failed"}[10m]) > 0.05
        for: 5m
        labels:
          severity: warning
          team: integration
        annotations:
          summary: "High sync failure rate for {{ $labels.partner }}"
          description: "Partner {{ $labels.partner }} sync failure rate is {{ $value }}"

      - alert: DataProcessingStuck
        expr: increase(data_processing_duration_seconds_count[1h]) == 0
        for: 30m
        labels:
          severity: critical
          team: integration
        annotations:
          summary: "Data processing appears stuck"
          description: "No data processing completions in the last hour"
```
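
Prometheus only evaluates these rules once the file is listed in its main configuration, e.g.:

```yaml
# Excerpt from prometheus.yml
rule_files:
  - monitoring/alerts.yml
```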

## Troubleshooting Playbook

### 1. Partner API Issues

#### Symptoms

- High error rates in partner API calls
- Timeouts or connection failures
- Authentication errors

#### Investigation Steps

1. Check the partner API status dashboard
2. Verify network connectivity to the partner
3. Validate API credentials and tokens
4. Review partner API rate limits
5. Check partner service logs

#### Resolution Actions

```bash
# Check partner connectivity
curl -H "Authorization: Bearer $TOKEN" https://api.partner.com/health

# Review recent errors
kubectl logs -f deployment/agmission-api | grep "partner.*error"

# Restart partner service if needed
kubectl rollout restart deployment/agmission-api
```

### 2. Queue Processing Issues

#### Symptoms

- Increasing queue depth
- Jobs stuck in pending status
- Worker processes crashing

#### Investigation Steps

1. Check queue worker health and logs
2. Verify Redis connectivity
3. Monitor memory and CPU usage
4. Check for deadlocks in job processing

#### Resolution Actions

```bash
# Check queue status
redis-cli -h redis-host info keyspace

# Restart queue workers
pm2 restart partner-sync-worker

# Clear stuck jobs (use with caution: flushdb deletes every key in the current database)
redis-cli -h redis-host flushdb
```

### 3. Data Sync Issues

#### Symptoms

- Jobs not syncing to partners
- Data not being retrieved from partners
- Sync operations timing out

#### Investigation Steps

1. Check assignment sync states in the database
2. Verify partner job creation
3. Review sync queue processing
4. Check for configuration issues

#### Resolution Actions

```javascript
// Manual sync trigger via API:
//   POST /api/sync/assignments/{assignmentId}/sync
//   body: { "operation": "job_upload", "force": true }

// Mongo shell query to inspect failed uploads for a partner
db.job_assigns.find({
  "syncState.jobUpload.status": "failed",
  "partnerType": "satloc"
}).limit(10);
```

### 4. Performance Issues

#### Symptoms

- Slow response times
- High memory usage
- Database query timeouts

#### Investigation Steps

1. Monitor application performance metrics
2. Check database query performance
3. Review memory and CPU utilization
4. Analyze slow API endpoints

#### Resolution Actions

```bash
# Scale up application pods
kubectl scale deployment agmission-api --replicas=3

# Check database performance (run in the mongo shell, not bash)
db.runCommand({currentOp: 1, "secs_running": {$gte: 5}})

# Monitor memory usage
kubectl top pods -l app=agmission-api
```

## Incident Response Procedures

### 1. Incident Classification

| Severity | Response Time | Example Issues |
|----------|---------------|----------------|
| P0 - Critical | 15 minutes | Complete system outage, data loss |
| P1 - High | 1 hour | Partner integration down, major feature broken |
| P2 - Medium | 4 hours | Performance degradation, minor feature issues |
| P3 - Low | 24 hours | Cosmetic issues, non-critical bugs |

### 2. Escalation Matrix

```
Level 1: On-call Engineer
├── Initial response and investigation
├── Follow standard playbooks
└── Escalate to Level 2 if needed

Level 2: Senior Engineer + Team Lead
├── Complex troubleshooting
├── Architecture decisions
└── Escalate to Level 3 if needed

Level 3: Architect + Management
├── System design issues
├── Business impact decisions
└── External vendor coordination
```

### 3. Communication Templates

#### Initial Alert

```
🚨 INCIDENT: Partner Integration Issue
Severity: P1
Impact: Satloc jobs not syncing
Started: 2025-07-18 10:30 UTC
Assigned: @engineer-oncall
Status: Investigating
Next Update: 10:45 UTC
```

#### Status Update

```
📊 UPDATE: Partner Integration Issue
Severity: P1
Root Cause: Partner API rate limiting
Action: Implementing exponential backoff
ETA: 11:00 UTC for resolution
Next Update: 11:00 UTC
```

#### Resolution Notice

```
✅ RESOLVED: Partner Integration Issue
Duration: 30 minutes
Root Cause: Partner API rate limiting
Fix: Updated retry logic with exponential backoff
Prevention: Added rate limit monitoring
Post-mortem: Scheduled for 2025-07-19 14:00 UTC
```

This monitoring and observability strategy provides comprehensive visibility into the partner integration system, enabling proactive issue detection and rapid incident response.