'use strict'; /** * Orphaned Application Details Cleanup Worker * * This script identifies and removes application detail records where the referenced fileId * no longer exists in the AppFile collection. These orphaned records can accumulate over time * when application files are deleted but their corresponding detail records remain. * * Key Features: * - Identifies orphaned application details by checking fileId references * - Uses in-memory caching of AppFile IDs for fast lookup performance * - Time-based processing (yearly/monthly periods) for handling billion+ documents * - Efficient ObjectId timestamp filtering for date ranges * - OPTIMIZED: Skips expensive countDocuments() calls that scan billions of records * - Progressive counting and early termination for empty periods * - Batch processing with configurable batch sizes for large datasets * - Bulk delete operations for efficient cleanup * - Comprehensive progress tracking per time period and overall * - Supports dry-run mode for safe testing * - Implements robust error handling with retry logic * - Follows the same database connection pattern as other worker scripts * * Performance Optimizations (Nov 2024): * - Eliminated countDocuments() calls that were scanning 1+ billion records per time period * - Added quick existence checks to skip empty periods * - Progressive counting shows processing rate instead of percentage * - Configurable counting strategies: skip (default), estimate, or full * * Usage: * # Check and remove orphaned application details for all years (2020-2025) * DEBUG=agm:clean-orphaned-details node server/workers/cleanOrphanedAppDetails.js * * # Run with specific environment file (loads all variables from the file) * set -a && source environment.env && set +a && DEBUG=agm:clean-orphaned-details node server/workers/cleanOrphanedAppDetails.js * set -a && source environment_prod.env && set +a && DEBUG=agm:clean-orphaned-details node server/workers/cleanOrphanedAppDetails.js * * # Run with 
dotenv (if your project uses it) * DOTENV_CONFIG_PATH=environment.env DEBUG=agm:clean-orphaned-details node -r dotenv/config server/workers/cleanOrphanedAppDetails.js * * # Process only a specific year using command line argument * DEBUG=agm:clean-orphaned-details node server/workers/cleanOrphanedAppDetails.js --specific-year=2024 * * # Process a range of years using command line arguments * DEBUG=agm:clean-orphaned-details node server/workers/cleanOrphanedAppDetails.js --start-year=2022 --end-year=2024 * * # Process a specific date range using command line arguments * DEBUG=agm:clean-orphaned-details node server/workers/cleanOrphanedAppDetails.js --start-date=2024-06-01 --end-date=2024-06-30 * * # Process from a specific date to now * DEBUG=agm:clean-orphaned-details node server/workers/cleanOrphanedAppDetails.js --start-date=2024-01-15 * * # Process with ISO datetime format * DEBUG=agm:clean-orphaned-details node server/workers/cleanOrphanedAppDetails.js --start-date=2024-06-01T10:30:00Z --end-date=2024-06-15T15:45:00Z * * # Fast execution (default - skips expensive counting) * DEBUG=agm:clean-orphaned-details COUNTING_STRATEGY=skip node server/workers/cleanOrphanedAppDetails.js * * # With estimation for progress tracking * DEBUG=agm:clean-orphaned-details COUNTING_STRATEGY=estimate node server/workers/cleanOrphanedAppDetails.js * * # Dry run mode with command line arguments * DEBUG=agm:clean-orphaned-details node server/workers/cleanOrphanedAppDetails.js --dry-run --start-year=2025 * * # Check only mode with command line arguments * DEBUG=agm:clean-orphaned-details node server/workers/cleanOrphanedAppDetails.js --check-only --specific-year=2024 * * # Silent mode - only show output when orphans are found * DEBUG=agm:clean-orphaned-details AGM_SILENT=true node server/workers/cleanOrphanedAppDetails.js * * # Write statistics to custom file * DEBUG=agm:clean-orphaned-details AGM_STATS_FILE=./results/cleanup-2024.json node server/workers/cleanOrphanedAppDetails.js * 
* # For large datasets (billion+ records), use memory management flags * DEBUG=agm:clean-orphaned-details node --expose-gc --max-old-space-size=8192 scripts/cleanOrphanedAppDetails.js * * # Using environment variables (legacy method) * DEBUG=agm:clean-orphaned-details SPECIFIC_YEAR=2024 node server/workers/cleanOrphanedAppDetails.js * DEBUG=agm:clean-orphaned-details START_YEAR=2022 END_YEAR=2024 node server/workers/cleanOrphanedAppDetails.js * DEBUG=agm:clean-orphaned-details START_DATE=2024-06-01 END_DATE=2024-06-30 node server/workers/cleanOrphanedAppDetails.js * DEBUG=agm:clean-orphaned-details DRY_RUN=true START_DATE=2024-01-01 node server/workers/cleanOrphanedAppDetails.js * * Command Line Arguments: * --dry-run # Only reports what would be deleted without making changes * --check-only # Only check for orphaned records without deleting * --start-year=YYYY # Starting year for processing (default: 2020) * --end-year=YYYY # Ending year for processing (default: current year) * --specific-year=YYYY # Process only a specific year (overrides start/end year) * --start-date=YYYY-MM-DD # Starting date for processing (YYYY-MM-DD or ISO format) * --end-date=YYYY-MM-DD # Ending date for processing (YYYY-MM-DD or ISO format) * --batch-size=N # Number of documents per batch (default: 1000) * --counting-strategy=STRATEGY # skip, estimate, or full (default: skip) * * Environment Variables: * - AGM_DRY_RUN=true # Only reports what would be deleted without making changes * - AGM_BATCH_SIZE=1000 # Number of documents per batch (default: 1000) * - AGM_MAX_RETRIES=3 # Maximum number of retries for errors (default: 3) * - AGM_RETRY_DELAY=1000 # Base delay in ms between retries (default: 1000) * - AGM_SHOW_PROGRESS=true # Whether to show progress indicator (default: true) * - AGM_SILENT=true # Suppress progress output unless orphans are found (default: false) * - AGM_CHECK_ONLY=false # Only check for orphaned records without deleting (default: false) * - AGM_TIME_PERIOD=yearly # 
Time period for batching: yearly, monthly, or custom (default: yearly) * - AGM_COUNTING_STRATEGY=skip # How to handle document counting: skip, estimate, or full (default: skip) * - AGM_START_YEAR=2020 # Starting year for processing (default: 2020) * - AGM_END_YEAR=2025 # Ending year for processing (default: current year) * - AGM_SPECIFIC_YEAR=2024 # Process only a specific year (overrides START_YEAR/END_YEAR) * - AGM_START_DATE=2024-01-01 # Starting date for processing (YYYY-MM-DD or ISO format) * - AGM_END_DATE=2024-12-31 # Ending date for processing (YYYY-MM-DD or ISO format) * - AGM_STATS_FILE=./cleanup-stats.json # File to write statistics results (default: ./cleanup-stats.json) */ const debug = require('debug')('agm:clean-orphaned-details'); const { DBConnection } = require('../helpers/db/connect.js'); const mongoose = require('mongoose'); const utils = require('../helpers/utils.js'); const AppDetail = require('../model/application_detail.js'); const AppFile = require('../model/application_file.js'); const fs = require('fs').promises; const path = require('path'); /** * Parse command line arguments * @returns {Object} Parsed configuration object */ function parseArguments() { const args = process.argv.slice(2); const config = { dryRun: process.env.AGM_DRY_RUN === 'true' || process.env.DRY_RUN === 'true', batchSize: parseInt(process.env.AGM_BATCH_SIZE || process.env.BATCH_SIZE || '1000', 10), maxRetries: parseInt(process.env.AGM_MAX_RETRIES || process.env.MAX_RETRIES || '3', 10), retryDelay: parseInt(process.env.AGM_RETRY_DELAY || process.env.RETRY_DELAY || '1000', 10), showProgress: (process.env.AGM_SHOW_PROGRESS || process.env.SHOW_PROGRESS || 'true') !== 'false', silent: process.env.AGM_SILENT === 'true' || process.env.SILENT === 'true', checkOnly: process.env.AGM_CHECK_ONLY === 'true' || process.env.CHECK_ONLY === 'true', timePeriod: process.env.AGM_TIME_PERIOD || process.env.TIME_PERIOD || 'yearly', countingStrategy: process.env.AGM_COUNTING_STRATEGY || 
process.env.COUNTING_STRATEGY || 'skip', startYear: parseInt(process.env.AGM_START_YEAR || process.env.START_YEAR || '2020', 10), endYear: parseInt(process.env.AGM_END_YEAR || process.env.END_YEAR || new Date().getFullYear().toString(), 10), specificYear: (process.env.AGM_SPECIFIC_YEAR || process.env.SPECIFIC_YEAR) ? parseInt(process.env.AGM_SPECIFIC_YEAR || process.env.SPECIFIC_YEAR, 10) : null, startDate: process.env.AGM_START_DATE || process.env.START_DATE || null, endDate: process.env.AGM_END_DATE || process.env.END_DATE || null, statsFile: process.env.AGM_STATS_FILE || './cleanup-stats.json' }; // Parse command line arguments and override environment variables for (let i = 0; i < args.length; i++) { const arg = args[i]; if (arg === '--dry-run') { config.dryRun = true; } else if (arg === '--check-only') { config.checkOnly = true; } else if (arg.startsWith('--start-year=')) { config.startYear = parseInt(arg.split('=')[1], 10); config.specificYear = null; // Clear specific year if start year is provided } else if (arg.startsWith('--end-year=')) { config.endYear = parseInt(arg.split('=')[1], 10); config.specificYear = null; // Clear specific year if end year is provided } else if (arg.startsWith('--specific-year=')) { config.specificYear = parseInt(arg.split('=')[1], 10); } else if (arg.startsWith('--start-date=')) { config.startDate = arg.split('=')[1]; config.specificYear = null; // Clear specific year if start date is provided } else if (arg.startsWith('--end-date=')) { config.endDate = arg.split('=')[1]; config.specificYear = null; // Clear specific year if end date is provided } else if (arg.startsWith('--batch-size=')) { config.batchSize = parseInt(arg.split('=')[1], 10); } else if (arg.startsWith('--counting-strategy=')) { config.countingStrategy = arg.split('=')[1]; } } return config; } // Parse configuration from both environment variables and command line arguments const CONFIG = parseArguments(); // Configuration constants for backward compatibility 
const DRY_RUN = CONFIG.dryRun;
const BATCH_SIZE = CONFIG.batchSize;
const MAX_RETRIES = CONFIG.maxRetries;
const RETRY_DELAY_MS = CONFIG.retryDelay;
const SHOW_PROGRESS = CONFIG.showProgress;
const SILENT = CONFIG.silent;
const CHECK_ONLY = CONFIG.checkOnly;
const TIME_PERIOD = CONFIG.timePeriod;
const COUNTING_STRATEGY = CONFIG.countingStrategy;
const START_YEAR = CONFIG.startYear;
const END_YEAR = CONFIG.endYear;
const SPECIFIC_YEAR = CONFIG.specificYear;
const START_DATE = CONFIG.startDate;
const END_DATE = CONFIG.endDate;
const STATS_FILE = CONFIG.statsFile;

/**
 * Create ObjectId from date for filtering.
 * MongoDB ObjectIds begin with a 4-byte Unix timestamp (seconds), so an
 * ObjectId built from a date can be used as a boundary in _id range queries.
 * @param {Date|string} date - Date to convert to ObjectId
 * @returns {mongoose.Types.ObjectId} ObjectId with timestamp
 */
function createObjectIdFromDate(date) {
  const dateObj = new Date(date);
  const timestamp = Math.floor(dateObj.getTime() / 1000);
  // 8 hex timestamp chars + 16 zero chars = the 24-char ObjectId hex string.
  // NOTE(review): assumes the timestamp renders as exactly 8 hex digits, which
  // holds for dates after 2004 — confirm no pre-2004 dates are ever passed in.
  const objectIdHex = timestamp.toString(16) + '0000000000000000';
  return new mongoose.Types.ObjectId(objectIdHex);
}

/**
 * Quick estimation of document count using sampling (alternative to countDocuments)
 * This provides a rough estimate without scanning billions of records
 * @param {Object} periodFilter - MongoDB filter for the time period
 * @param {string} timePeriodName - Name of the time period for logging
 * @returns {Promise} Estimated document count (0 = period empty, -1 = estimation failed)
 */
async function estimateDocumentCount(periodFilter, timePeriodName) {
  try {
    // Sample a small number of documents to estimate density
    const sampleSize = 1000;
    const sampleDocs = await AppDetail.find(periodFilter)
      .select('_id')
      .limit(sampleSize)
      .lean();
    if (sampleDocs.length === 0) return 0;
    // Fewer docs than the sample cap means we saw the whole period: exact count.
    if (sampleDocs.length < sampleSize) return sampleDocs.length;
    // Use collection stats for rough estimation
    // NOTE(review): collStats.size is a byte size, not a document count — the
    // `|| collStats.size` fallback may wildly overestimate; confirm intent.
    const collStats = await mongoose.connection.db.collection('application_details').stats();
    const totalDocs = collStats.count || collStats.size || 0;
    // Estimate based on ObjectId time range
    // NOTE(review): the sample query has no sort, so first/last sample order is
    // the server's natural order — presumed insertion order; verify.
    const startId = sampleDocs[0]._id;
    const endId = sampleDocs[sampleDocs.length - 1]._id;
    const timeRangeMs = endId.getTimestamp().getTime() - startId.getTimestamp().getTime();
    const totalTimeSpanMs = Date.now() - new Date('2020-01-01').getTime(); // Rough total span
    const estimatedCount = Math.floor((totalDocs * timeRangeMs) / totalTimeSpanMs);
    debug(`${timePeriodName} estimated count: ~${estimatedCount.toLocaleString()} (sample-based)`);
    return estimatedCount;
  } catch (error) {
    debug(`Estimation failed for ${timePeriodName}: ${error.message}, falling back to progressive counting`);
    return -1; // Indicates estimation failed
  }
}

/**
 * Parse date string into Date object with validation.
 * Bare YYYY-MM-DD strings are anchored to start of day UTC; strings containing
 * 'T' or 'Z' are handed to the Date constructor as-is (ISO datetime).
 * @param {string} dateString - Date string in YYYY-MM-DD or ISO format
 * @param {string} paramName - Parameter name for error messages
 * @returns {Date} Parsed date object (null when dateString is empty)
 * @throws {Error} When the string cannot be parsed as a date
 */
function parseDate(dateString, paramName) {
  if (!dateString) return null;
  let date;
  // Try parsing as ISO string first, then as YYYY-MM-DD
  if (dateString.includes('T') || dateString.includes('Z')) {
    date = new Date(dateString);
  } else {
    // Assume YYYY-MM-DD format and create at start of day UTC
    date = new Date(`${dateString}T00:00:00.000Z`);
  }
  if (isNaN(date.getTime())) {
    throw new Error(`Invalid date format for ${paramName}: ${dateString}. Use YYYY-MM-DD or ISO format.`);
  }
  return date;
}

/**
 * Generate time periods to process based on configuration.
 * Priority: explicit START_DATE/END_DATE range > SPECIFIC_YEAR > START_YEAR..END_YEAR.
 * Period sizing adapts to the span so each period stays small enough to process:
 * <=31 days -> single period, <=365 days -> monthly, otherwise yearly; year-based
 * modes use monthly periods for 2020+ and whole years before that.
 * @returns {Array} Array of time period objects with name, startDate and endDate
 */
function generateTimePeriods() {
  const periods = [];
  // If specific dates are provided, use them (highest priority)
  if (START_DATE || END_DATE) {
    const startDate = START_DATE ? parseDate(START_DATE, 'START_DATE') : new Date('2020-01-01T00:00:00.000Z');
    const endDate = END_DATE ? parseDate(END_DATE, 'END_DATE') : new Date(); // Current date if not specified
    // Ensure end date is after start date
    if (endDate <= startDate) {
      throw new Error(`End date (${endDate.toISOString()}) must be after start date (${startDate.toISOString()})`);
    }
    // For date ranges, create periods based on the time span
    const daysDiff = Math.ceil((endDate - startDate) / (1000 * 60 * 60 * 24));
    if (daysDiff <= 31) {
      // Single period for ranges up to 1 month
      periods.push({
        name: `Custom Range: ${startDate.toISOString().split('T')[0]} to ${endDate.toISOString().split('T')[0]}`,
        startDate: startDate,
        endDate: endDate
      });
    } else if (daysDiff <= 365) {
      // Monthly periods for ranges up to 1 year
      // NOTE(review): month boundaries here use the local-time Date(y, m, d)
      // constructor while range starts are UTC — periods may be offset by the
      // host timezone; confirm the workers always run in UTC.
      let currentDate = new Date(startDate);
      let periodCount = 1;
      while (currentDate < endDate) {
        const periodEnd = new Date(Math.min(
          new Date(currentDate.getFullYear(), currentDate.getMonth() + 1, 1).getTime(),
          endDate.getTime()
        ));
        periods.push({
          name: `Period ${periodCount}: ${currentDate.toISOString().split('T')[0]} to ${periodEnd.toISOString().split('T')[0]}`,
          startDate: new Date(currentDate),
          endDate: periodEnd
        });
        currentDate = new Date(currentDate.getFullYear(), currentDate.getMonth() + 1, 1);
        periodCount++;
      }
    } else {
      // Yearly periods for ranges longer than 1 year
      let currentDate = new Date(startDate);
      let periodCount = 1;
      while (currentDate < endDate) {
        const periodEnd = new Date(Math.min(
          new Date(currentDate.getFullYear() + 1, 0, 1).getTime(),
          endDate.getTime()
        ));
        periods.push({
          name: `Period ${periodCount}: ${currentDate.toISOString().split('T')[0]} to ${periodEnd.toISOString().split('T')[0]}`,
          startDate: new Date(currentDate),
          endDate: periodEnd
        });
        currentDate = new Date(currentDate.getFullYear() + 1, 0, 1);
        periodCount++;
      }
    }
  } else if (SPECIFIC_YEAR) {
    // Process the specific year in monthly periods for better memory management and progress tracking
    debug(`Processing specific year ${SPECIFIC_YEAR} in monthly periods`);
    for (let month = 0; month < 12; month++) {
      const startDate = new Date(`${SPECIFIC_YEAR}-${String(month + 1).padStart(2, '0')}-01T00:00:00.000Z`);
      const endDate = new Date(SPECIFIC_YEAR, month + 1, 1); // First day of next month
      periods.push({
        name: `${SPECIFIC_YEAR}-${String(month + 1).padStart(2, '0')} (${startDate.toLocaleDateString('en-US', { month: 'long', year: 'numeric' })})`,
        startDate: startDate,
        endDate: endDate
      });
    }
  } else {
    // Process from START_YEAR to END_YEAR
    // For recent years (which likely have more data), split into monthly periods
    for (let year = START_YEAR; year <= END_YEAR; year++) {
      // Split years 2020 and later into monthly periods for better memory management
      if (year >= 2020) {
        debug(`Processing year ${year} in monthly periods (recent year with potentially large dataset)`);
        for (let month = 0; month < 12; month++) {
          const startDate = new Date(`${year}-${String(month + 1).padStart(2, '0')}-01T00:00:00.000Z`);
          const endDate = new Date(year, month + 1, 1); // First day of next month
          periods.push({
            name: `${year}-${String(month + 1).padStart(2, '0')} (${startDate.toLocaleDateString('en-US', { month: 'long', year: 'numeric' })})`,
            startDate: startDate,
            endDate: endDate
          });
        }
      } else {
        // For older years (pre-2020), process as full years since they likely have less data
        periods.push({
          name: `Year ${year}`,
          startDate: new Date(`${year}-01-01T00:00:00.000Z`),
          endDate: new Date(`${year + 1}-01-01T00:00:00.000Z`)
        });
      }
    }
  }
  return periods;
}

/**
 * Write statistics to JSON file (append mode to preserve history).
 * Rate-limited to one write per 30 seconds unless force is set; the timestamp
 * of the last write is memoized on the function object itself.
 * @param {Object} stats - Statistics object to write
 * @param {string} phase - Current phase (e.g., 'period', 'final')
 * @param {string} periodName - Name of current period (optional)
 * @param {boolean} force - Force write even if not enough time has passed
 */
async function writeStatsToFile(stats, phase = 'update', periodName = null, force = false) {
  try {
    // Rate limit statistics writing - only write every 30 seconds unless forced
    const now = Date.now();
    if (!force && writeStatsToFile.lastWrite && (now - writeStatsToFile.lastWrite) < 30000) {
      return;
    }
    writeStatsToFile.lastWrite = now;

    // Read existing statistics file to preserve history
    let existingData = { sessions: [] };
    try {
      if (await fs.access(STATS_FILE).then(() => true).catch(() => false)) {
        const existingContent = await fs.readFile(STATS_FILE, 'utf8');
        if (existingContent.trim()) {
          existingData = JSON.parse(existingContent);
          // Ensure sessions array exists
          if (!existingData.sessions) {
            existingData.sessions = [];
          }
        }
      }
    } catch (parseError) {
      // A corrupt stats file is not fatal — start a fresh history.
      debug(`Warning: Could not parse existing stats file, starting fresh: ${parseError.message}`);
      existingData = { sessions: [] };
    }

    // Create new session entry
    const sessionEntry = {
      sessionId: stats.sessionId || `session_${Date.now()}`,
      timestamp: new Date().toISOString(),
      phase: phase,
      currentPeriod: periodName,
      sessionSummary: {
        totalDeleted: stats.deleted,
        totalOrphaned: stats.totalOrphaned,
        periodsProcessed: stats.periodsProcessed,
        periodsTotal: stats.periodsTotal,
        errors: stats.errors,
        sessionStartTime: stats.startTime || new Date().toISOString()
      },
      allPeriods: stats.periodResults,
      configuration: {
        dryRun: DRY_RUN,
        checkOnly: CHECK_ONLY,
        batchSize: BATCH_SIZE,
        countingStrategy: COUNTING_STRATEGY,
        specificYear: SPECIFIC_YEAR,
        startYear: START_YEAR,
        endYear: END_YEAR,
        startDate: START_DATE,
        endDate: END_DATE
      }
    };

    // For 'started' phase, create a new session
    if (phase === 'started') {
      existingData.sessions.push(sessionEntry);
    } else {
      // For other phases, update the current session (last entry)
      if (existingData.sessions.length > 0) {
        const currentSession = existingData.sessions[existingData.sessions.length - 1];
        // Update the existing session with new data
        Object.assign(currentSession, sessionEntry);
      } else {
        // No existing session, create new one
        existingData.sessions.push(sessionEntry);
      }
    }

    // Keep only the last 50 sessions to prevent file from growing too large
    if (existingData.sessions.length > 50) {
      existingData.sessions =
existingData.sessions.slice(-50);
    }

    // Add metadata
    existingData.lastUpdated = new Date().toISOString();
    existingData.totalSessions = existingData.sessions.length;

    await fs.writeFile(STATS_FILE, JSON.stringify(existingData, null, 2), 'utf8');
    debug(`Statistics appended to ${STATS_FILE} (session ${sessionEntry.sessionId})`);
  } catch (error) {
    // Stats writing is best-effort: a reporting failure must never abort the cleanup run.
    debug(`Error writing statistics to file: ${error.message}`);
  }
}

/**
 * Process orphaned records immediately (clean after each period).
 * In CHECK_ONLY mode only the counters are updated; otherwise the records are
 * deleted in BATCH_SIZE chunks via deleteBatch, with a short pause every 10
 * batches and a bail-out after more than 8 accumulated errors.
 * @param {Array} orphanedRecords - Array of orphaned records for this period
 * @param {Object} stats - Statistics object to update
 * @param {string} periodName - Name of the current period
 * @returns {Promise}
 */
async function processOrphanedRecordsImmediately(orphanedRecords, stats, periodName) {
  if (orphanedRecords.length === 0) {
    debug(`No orphaned records to process for ${periodName}`);
    return;
  }
  debug(`Processing ${orphanedRecords.length} orphaned records for ${periodName}...`);
  if (CHECK_ONLY) {
    debug(`CHECK_ONLY mode: Found ${orphanedRecords.length} orphaned records in ${periodName}`);
    stats.processed += orphanedRecords.length;
    stats.dryRunCount += orphanedRecords.length;
    stats.totalOrphaned += orphanedRecords.length;
    return;
  }
  // Get sample records for reporting
  const sampleRecords = await getSampleOrphanedRecords(orphanedRecords, 3);
  if (sampleRecords.length > 0) {
    debug(`Sample orphaned records from ${periodName}:`);
    sampleRecords.forEach((record, index) => {
      const createdDate = record._id.getTimestamp().toISOString();
      debug(` ${index + 1}. ID: ${record._id}, FileID: ${record.fileId}, Created: ${createdDate}`);
    });
  }
  // Process orphaned records in batches for deletion
  // NOTE(review): one call site passes a throwaway stats object here, so
  // deletions recorded below may not reach the session-level stats — confirm.
  const batches = utils.chunkArray(orphanedRecords, BATCH_SIZE);
  debug(`Processing ${batches.length} deletion batches for ${periodName}`);
  for (const batch of batches) {
    try {
      stats.batches++;
      // Delete the batch
      await deleteBatch(batch, stats);
      // Small delay between batches to reduce database load
      if (stats.batches % 10 === 0) {
        await sleep(100);
      }
    } catch (error) {
      debug(`Error processing deletion batch ${stats.batches} for ${periodName}: ${error.message}`);
      stats.errors++;
      // Continue with next batch, but break if too many consecutive errors
      if (stats.errors > 8) {
        debug('Too many deletion errors, stopping batch processing');
        break;
      }
    }
  }
  stats.totalOrphaned += orphanedRecords.length;
  debug(`Completed processing ${orphanedRecords.length} orphaned records for ${periodName}`);
}

/**
 * Sleep for specified milliseconds
 * @param {number} ms - Milliseconds to sleep
 * @returns {Promise}
 */
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Retry wrapper for operations with exponential backoff.
 * Delay grows as RETRY_DELAY_MS * 2^attempt between attempts; the last error
 * is rethrown once the retry budget is exhausted.
 * @param {Function} operation - Operation to retry
 * @param {string} operationName - Name of the operation for logging
 * @param {number} maxRetries - Maximum number of retries
 * @returns {Promise} Result of the operation
 */
async function withRetry(operation, operationName, maxRetries = MAX_RETRIES) {
  let lastError;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await operation();
    } catch (error) {
      lastError = error;
      if (attempt === maxRetries) {
        debug(`${operationName} failed after ${maxRetries + 1} attempts: ${error.message}`);
        throw error;
      }
      const delay = RETRY_DELAY_MS * Math.pow(2, attempt);
      debug(`${operationName} failed (attempt ${attempt + 1}/${maxRetries + 1}): ${error.message}. Retrying in ${delay}ms...`);
      await sleep(delay);
    }
  }
  throw lastError;
}

/**
 * Load all existing AppFile IDs into memory for fast lookup.
 * Files flagged markedDelete are excluded, so details referencing them are
 * treated as orphaned.
 * @returns {Promise} Set of all existing AppFile _ids (as strings)
 */
async function loadAppFileIds() {
  debug('Loading all AppFile IDs into memory...');
  return await withRetry(async () => {
    const appFiles = await AppFile.find({ markedDelete: { $ne: true } }, { _id: 1 }).lean();
    const fileIds = new Set(appFiles.map(file => file._id.toString()));
    debug(`Loaded ${fileIds.size} AppFile IDs into memory cache`);
    return fileIds;
  }, 'Load AppFile IDs');
}

/**
 * Find orphaned application details by checking against in-memory cache for a specific time period
 * This approach loads all AppFile IDs into memory first, then checks each application detail within the time range
 * Uses cursor-based pagination instead of skip() for better performance on large datasets
 * @param {Object} timePeriod - Time period object with startDate and endDate
 * @param {Set} existingFileIds - Set of existing AppFile IDs for lookup
 * @returns {Promise} Summary object { found, processed } — counts, not the documents themselves
 */
async function findOrphanedAppDetailsForPeriod(timePeriod, existingFileIds) {
  const periodStartTime = new Date();
  debug(`Finding orphaned application details for ${timePeriod.name}...`);
  // Create ObjectId filters for the time period
  const startObjectId = createObjectIdFromDate(timePeriod.startDate);
  const endObjectId = createObjectIdFromDate(timePeriod.endDate);
  debug(`Period: ${timePeriod.startDate.toISOString()} to ${timePeriod.endDate.toISOString()}`);
  debug(`ObjectId range: ${startObjectId} to ${endObjectId}`);
  // Get total count of application details for this period
  const periodFilter = { _id: { $gte: startObjectId, $lt: endObjectId } };
  // Handle counting strategy: skip, estimate, or full
  let totalAppDetails = -1; // -1 indicates progressive counting
  if (COUNTING_STRATEGY === 'full') {
    // Original expensive approach - scan all documents
    totalAppDetails = await withRetry(async
() => {
      return await AppDetail.countDocuments(periodFilter);
    }, `Count application details for ${timePeriod.name}`);
    debug(`Checking ${totalAppDetails} application details in ${timePeriod.name} against ${existingFileIds.size} existing file IDs`);
    if (totalAppDetails === 0) {
      debug(`No application details found for ${timePeriod.name}`);
      return { found: 0, processed: 0 };
    }
  } else if (COUNTING_STRATEGY === 'estimate') {
    // Use estimation for approximate progress tracking
    totalAppDetails = await estimateDocumentCount(periodFilter, timePeriod.name);
    if (totalAppDetails === 0) {
      debug(`No application details found for ${timePeriod.name} - skipping`);
      return { found: 0, processed: 0 };
    } else if (totalAppDetails > 0) {
      debug(`Estimated ${totalAppDetails.toLocaleString()} application details in ${timePeriod.name} (checking against ${existingFileIds.size} existing file IDs)`);
    } else {
      // Estimation failed, fall back to progressive counting
      debug(`Estimation failed for ${timePeriod.name}, using progressive counting`);
      totalAppDetails = -1;
    }
  } else {
    // Default: skip counting entirely - use progressive counting
    debug(`Scanning ${timePeriod.name} for orphaned application details (progressive counting enabled)`);
    debug(`Period filter: _id >= ${startObjectId} and _id < ${endObjectId}`);
    debug(`Will check against ${existingFileIds.size} existing file IDs`);
    // Quick check if period has any data by fetching just one document
    const hasData = await withRetry(async () => {
      const sample = await AppDetail.findOne(periodFilter).select('_id').lean();
      return !!sample;
    }, `Check if ${timePeriod.name} has data`);
    if (!hasData) {
      debug(`No application details found for ${timePeriod.name} - skipping`);
      return { found: 0, processed: 0 };
    }
  }

  // Use streaming approach - process in chunks without accumulating all orphaned records
  let processed = 0;
  let totalOrphaned = 0;
  let lastId = startObjectId; // cursor position: _id of the last document seen
  const checkBatchSize = Math.min(BATCH_SIZE, 5000); // Use smaller batches for memory checking
  // Adaptive progress reporting - less frequent for large datasets
  const progressInterval = existingFileIds.size > 1000000 ? checkBatchSize * 50 : checkBatchSize * 10;

  while (true) {
    // Use cursor-based pagination instead of skip() for better performance
    const cursorFilter = {
      _id: {
        // Use $gt if we have processed records (lastId), otherwise start from beginning with $gte
        [lastId.equals(startObjectId) ? '$gte' : '$gt']: lastId,
        $lt: endObjectId
      }
    };
    // Fetch batch of application details for this time period
    const appDetailsBatch = await withRetry(async () => {
      return await AppDetail.find(cursorFilter)
        .select('_id fileId')
        .sort({ _id: 1 })
        .limit(checkBatchSize)
        .lean();
    }, `Fetch application details batch for ${timePeriod.name} (lastId: ${lastId})`);
    if (appDetailsBatch.length === 0) {
      break; // No more records
    }
    // Process this batch immediately - find orphaned records and process them
    const orphanedBatch = [];
    for (const appDetail of appDetailsBatch) {
      // Skip records with missing or null fileId - these are legacy data
      if (!appDetail.fileId) {
        // Skip this record, probably used appId originally, don't count it as orphaned
        processed++;
        continue;
      }
      // A detail is orphaned when its fileId is absent from the in-memory cache.
      if (!existingFileIds.has(appDetail.fileId.toString())) {
        orphanedBatch.push({
          _id: appDetail._id,
          fileId: appDetail.fileId
        });
      }
      processed++;
    }
    // Process orphaned records from this batch immediately to avoid memory accumulation
    if (orphanedBatch.length > 0) {
      totalOrphaned += orphanedBatch.length;
      // Process immediately if not in CHECK_ONLY mode
      // NOTE(review): a fresh throwaway stats object is passed here, so per-batch
      // deletion counters are discarded — confirm this is intentional.
      if (!CHECK_ONLY) {
        await processOrphanedRecordsImmediately(orphanedBatch, { deleted: 0, errors: 0, batches: 0 }, timePeriod.name);
      }
      debug(`Processed ${orphanedBatch.length} orphaned records from batch (Total orphaned so far: ${totalOrphaned})`);
    }
    // Update lastId for cursor-based pagination
    if (appDetailsBatch.length > 0) {
      // Set lastId to the last document's _id from this batch
      const lastDoc = appDetailsBatch[appDetailsBatch.length - 1];
      lastId = lastDoc._id;
      // For next iteration, we'll use $gt instead of $gte to avoid duplicates
      // So we need to slightly modify the cursor filter
    }
    // Show progress for the checking phase, unless silent mode is enabled and no orphans found
    // Less frequent progress reporting for large datasets
    if (!SILENT && SHOW_PROGRESS && processed % progressInterval === 0) {
      const elapsedSeconds = (Date.now() - periodStartTime.getTime()) / 1000;
      const rate = processed / (elapsedSeconds || 1);
      if (totalAppDetails > 0) {
        // Show percentage progress when we have total count (estimate or full)
        const percentage = ((processed / totalAppDetails) * 100).toFixed(1);
        debug(`${timePeriod.name} progress: ${processed}/${totalAppDetails} (${percentage}%) ${totalAppDetails > 1000000 ? '[estimated]' : ''} - Found ${totalOrphaned} orphaned so far`);
      } else {
        // Progressive counting mode - show rate only
        debug(`${timePeriod.name} progress: ${processed} processed (${rate.toFixed(1)} records/sec) - Found ${totalOrphaned} orphaned so far`);
      }
    }
    // Break if we've processed fewer records than requested (end of collection)
    if (appDetailsBatch.length < checkBatchSize) {
      break;
    }
    // Small delay to prevent overwhelming the database - longer for large datasets
    const delayMs = existingFileIds.size > 1000000 ?
50 : 10;
    await sleep(delayMs);
    // Force garbage collection every 100 batches to prevent memory buildup
    // (only effective when node is started with --expose-gc)
    if (processed % (checkBatchSize * 100) === 0) {
      if (global.gc) {
        global.gc();
        debug(`Forced garbage collection at ${processed} records processed`);
      }
    }
  }
  debug(`Completed checking ${processed} application details for ${timePeriod.name} - Found ${totalOrphaned} orphaned records`);
  // NOTE(review): `processed` is returned as totalOrphaned, not the scan count —
  // confirm whether callers expect the number scanned or the number orphaned.
  return { found: totalOrphaned, processed: totalOrphaned };
}

/**
 * Get sample of orphaned records for reporting
 * @param {Array} orphanedIds - Array of orphaned document _ids
 * @param {number} sampleSize - Number of sample records to retrieve
 * @returns {Promise} Sample of orphaned records
 */
async function getSampleOrphanedRecords(orphanedIds, sampleSize = 5) {
  if (orphanedIds.length === 0) return [];
  // Take the first N entries; each entry is an object carrying an _id.
  const sampleIds = orphanedIds.slice(0, sampleSize).map(doc => doc._id);
  return await withRetry(async () => {
    return await AppDetail.find({ _id: { $in: sampleIds } }).select('_id fileId lat lon').lean();
  }, 'Get sample orphaned records');
}

/**
 * Delete a batch of orphaned application details.
 * Honors DRY_RUN / CHECK_ONLY by only updating counters; otherwise performs a
 * bulk deleteMany wrapped in retry logic.
 * @param {Array} batch - Batch of document _ids to delete
 * @param {Object} stats - Statistics object to update
 * @returns {Promise}
 */
async function deleteBatch(batch, stats) {
  if (!batch || batch.length === 0) {
    return;
  }
  const batchIds = batch.map(doc => doc._id);
  debug(`Processing batch of ${batchIds.length} orphaned records...`);
  if (DRY_RUN || CHECK_ONLY) {
    debug(`${DRY_RUN ? 'DRY RUN' : 'CHECK ONLY'}: Would delete ${batchIds.length} orphaned application details`);
    stats.processed += batchIds.length;
    stats.dryRunCount += batchIds.length;
    return;
  }
  // Execute bulk delete operation with retry
  const result = await withRetry(async () => {
    return await AppDetail.deleteMany({ _id: { $in: batchIds } });
  }, `Delete batch of ${batchIds.length} orphaned records`);
  // Update statistics
  stats.processed += batchIds.length;
  stats.deleted += result.deletedCount || 0;
  debug(`Batch completed: ${result.deletedCount} records deleted`);
}

/**
 * Display progress information
 * @param {number} processed - Number of documents processed
 * @param {number} total - Total number of documents
 * @param {Date} startTime - Start time of the operation
 */
function showProgress(processed, total, startTime) {
  if (!SHOW_PROGRESS || total === 0) return;
  const elapsed = Date.now() - startTime.getTime();
  const rate = processed / (elapsed / 1000);
  const remaining = total - processed;
  // NOTE(review): when rate is 0 (nothing processed yet) the ETA becomes
  // Infinity and formats oddly — confirm callers only invoke after progress.
  const eta = remaining > 0 ? remaining / rate : 0;
  const percentage = ((processed / total) * 100).toFixed(1);
  // Render seconds as "Hh Mm Ss" / "Mm Ss" / "Ss" depending on magnitude.
  const formatTime = (seconds) => {
    const hours = Math.floor(seconds / 3600);
    const minutes = Math.floor((seconds % 3600) / 60);
    const secs = Math.floor(seconds % 60);
    if (hours > 0) {
      return `${hours}h ${minutes}m ${secs}s`;
    } else if (minutes > 0) {
      return `${minutes}m ${secs}s`;
    } else {
      return `${secs}s`;
    }
  };
  debug(`Progress: ${processed}/${total} (${percentage}%) | Rate: ${rate.toFixed(1)} records/sec | ETA: ${formatTime(eta)}`);
}

/**
 * Main function to clean orphaned application details
 * @returns {Promise} Statistics about the cleanup operation
 */
async function cleanOrphanedAppDetails() {
  const startTime = new Date();
  debug(`Starting orphaned application details cleanup...`);
  debug(`Configuration:`);
  debug(` - DRY_RUN: ${DRY_RUN}`);
  debug(` - CHECK_ONLY: ${CHECK_ONLY}`);
  debug(` - BATCH_SIZE: ${BATCH_SIZE}`);
  debug(` - TIME_PERIOD: ${TIME_PERIOD}`);
  debug(` - COUNTING_STRATEGY: ${COUNTING_STRATEGY} (skip=fastest, estimate=approximate, full=slow)`);
  debug(` - SPECIFIC_YEAR: ${SPECIFIC_YEAR || 'none'}`);
  debug(` - START_YEAR: ${START_YEAR}`);
  debug(` - END_YEAR: ${END_YEAR}`);
  debug(` - START_DATE: ${START_DATE || 'none'}`);
  debug(` - END_DATE: ${END_DATE || 'none'}`);
  debug(` - Date range mode: ${START_DATE || END_DATE ? 'ENABLED' : 'DISABLED'}`);
  debug(` - Years to process: ${SPECIFIC_YEAR || (START_DATE || END_DATE ?
'custom date range' : `${START_YEAR}-${END_YEAR}`)}`); debug(` - Command line args: ${process.argv.slice(2).join(' ') || 'none'}`); // Debug environment variable sources debug('Environment Variable Sources:'); debug(` - process.env.AGM_START_YEAR: "${process.env.AGM_START_YEAR || 'undefined'}"`); debug(` - process.env.START_YEAR: "${process.env.START_YEAR || 'undefined'}"`); debug(` - process.env.AGM_END_YEAR: "${process.env.AGM_END_YEAR || 'undefined'}"`); debug(` - process.env.END_YEAR: "${process.env.END_YEAR || 'undefined'}"`); debug(` - process.env.AGM_START_DATE: "${process.env.AGM_START_DATE || 'undefined'}"`); debug(` - process.env.START_DATE: "${process.env.START_DATE || 'undefined'}"`); debug(` - process.env.AGM_END_DATE: "${process.env.AGM_END_DATE || 'undefined'}"`); debug(` - process.env.END_DATE: "${process.env.END_DATE || 'undefined'}"`); debug(` - process.env.AGM_SPECIFIC_YEAR: "${process.env.AGM_SPECIFIC_YEAR || 'undefined'}"`); debug(` - process.env.SPECIFIC_YEAR: "${process.env.SPECIFIC_YEAR || 'undefined'}"`); // Initialize statistics const stats = { sessionId: `session_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`, processed: 0, deleted: 0, errors: 0, dryRunCount: 0, totalOrphaned: 0, batches: 0, periodsProcessed: 0, periodsTotal: 0, periodResults: [], // Track results per period startTime: new Date().toISOString() // Track session start time }; try { // Load all AppFile IDs into memory once at the beginning debug('Loading AppFile IDs into memory...'); const allFileIds = await loadAppFileIds(); // Generate time periods to process const timePeriods = generateTimePeriods(); stats.periodsTotal = timePeriods.length; debug(`Processing ${timePeriods.length} time periods:`); timePeriods.forEach(period => { debug(` - ${period.name}: ${period.startDate.toISOString().split('T')[0]} to ${period.endDate.toISOString().split('T')[0]}`); }); // Write initial statistics await writeStatsToFile(stats, 'started', null, true); for (const timePeriod of 
timePeriods) { const periodStartTime = new Date(); try { debug(`\n${'='.repeat(60)}`); debug(`Processing ${timePeriod.name}...`); debug(`${'='.repeat(60)}`); // Find orphaned records for this time period (streaming approach) const periodResult = await findOrphanedAppDetailsForPeriod(timePeriod, allFileIds); // Track period results const periodStats = { name: timePeriod.name, startDate: timePeriod.startDate.toISOString(), endDate: timePeriod.endDate.toISOString(), orphanedFound: periodResult.found, processed: periodResult.processed, deleted: periodResult.processed, // In streaming mode, found = processed = deleted (unless CHECK_ONLY) errors: 0, duration: (Date.now() - periodStartTime.getTime()) / 1000 }; if (periodResult.found > 0) { debug(`Found and processed ${periodResult.found} orphaned records in ${timePeriod.name}`); // Update global stats stats.totalOrphaned += periodResult.found; stats.processed += periodResult.processed; if (!CHECK_ONLY) { stats.deleted += periodResult.processed; } } else { debug(`No orphaned records found in ${timePeriod.name}`); } stats.periodResults.push(periodStats); stats.periodsProcessed++; // Write updated statistics after each period await writeStatsToFile(stats, 'period-completed', timePeriod.name, true); debug(`Period ${timePeriod.name} completed in ${periodStats.duration.toFixed(2)}s`); } catch (error) { debug(`Error processing ${timePeriod.name}: ${error.message}`); stats.errors++; // Track failed period const periodResult = { name: timePeriod.name, startDate: timePeriod.startDate.toISOString(), endDate: timePeriod.endDate.toISOString(), orphanedFound: 0, processed: 0, deleted: 0, errors: 1, duration: (Date.now() - periodStartTime.getTime()) / 1000, error: error.message }; stats.periodResults.push(periodResult); // Continue with next period unless too many errors if (stats.errors > 3) { debug('Too many period errors, stopping operation'); break; } } } if (stats.totalOrphaned === 0) { debug('\nNo orphaned application details 
found across all time periods. Database is clean!'); await writeStatsToFile(stats, 'completed-clean', null, true); return stats; } const endTime = new Date(); const duration = (endTime.getTime() - startTime.getTime()) / 1000; debug('\nCleanup operation completed!'); debug('='.repeat(50)); debug(`Time periods processed: ${stats.periodsProcessed}/${stats.periodsTotal}`); debug(`Total orphaned records found: ${stats.totalOrphaned}`); debug(`Total deletion batches processed: ${stats.batches}`); debug(`Records processed: ${stats.processed}`); if (DRY_RUN) { debug(`Dry run count: ${stats.dryRunCount}`); } else if (CHECK_ONLY) { debug(`Check only count: ${stats.dryRunCount}`); } else { debug(`Records deleted: ${stats.deleted}`); } debug(`Errors encountered: ${stats.errors}`); debug(`Duration: ${duration.toFixed(2)} seconds`); debug(`Average rate: ${stats.processed > 0 ? (stats.processed / duration).toFixed(1) : 0} records/second`); debug(`Statistics written to: ${STATS_FILE}`); debug('='.repeat(50)); // Write final statistics stats.totalDuration = duration; stats.completedAt = endTime.toISOString(); await writeStatsToFile(stats, 'completed', null, true); return stats; } catch (error) { debug(`Fatal error during cleanup operation: ${error.message}`); throw error; } } /** * Set up global process error handling */ process .on('uncaughtException', function (err) { debug('Uncaught Exception:', err); process.exit(1); }) .on('unhandledRejection', (reason, p) => { debug('Unhandled Rejection at Promise:', p, 'reason:', reason); process.exit(1); }); /** * Main execution - follows the same pattern as other worker scripts */ async function main() { const dbConn = new DBConnection('Clean Orphaned App Details Script'); try { await dbConn.initialize({ setupExitHandlers: false }); debug('Database connected'); // Run the cleanup operation const result = await cleanOrphanedAppDetails(); // Log final result if (result.errors > 0) { debug(`Operation completed with ${result.errors} errors`); 
} else { debug('Operation completed successfully'); } } catch (error) { debug('Operation failed:', error); } finally { await dbConn.close(); process.exit(); } } // Execute main function main();