Parcourir la source

fix: eliminate stuck job root causes — button visibility, watchdog, and cancel bugs

- isAdmin: check user.globalRole === 'ADMIN' instead of project membership
  (button was invisible to workspace admins who weren't project ADMINs)
- Add GET /api/assets/admin/stuck-count: lightweight workspace-wide PROCESSING count
  (registered before /:id to prevent Express shadowing bug)
- TranscodeTasksPanel: use globalStuckCount (polled from API every 30s) for
  button disabled/badge state instead of per-page count
- Watchdog: move from poll() idle path to setInterval(STUCK_TIMEOUT_MS) so
  stuck jobs are detected even when the queue is perpetually full
- Watchdog grace period: recentlyReset Set prevents same worker from immediately
  re-claiming a job it just watchdog-reset (WATCHDOG_GRACE_MS = 50% of timeout or 30s)
- Cancel endpoint: also clears transcodePaused so paused+cancelled jobs are re-claimable
- docker-compose.dev.yml: pass STUCK_TIMEOUT_MS (default 300s) to worker container
- Dockerfile.api: remove the /etc/resolv.conf DNS override, which failed on a read-only file system and broke builds

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
kingkong il y a 1 mois
Parent
commit
0e852341e3

+ 0 - 6
Dockerfile.api

@@ -1,12 +1,6 @@
 FROM node:22-slim
 
 # Install FFmpeg + wget (for thumbnail generation and healthcheck)
-# DNS: 1.1.1.1 for fast DNS resolution; fallback to security.debian.org mirror
-RUN cat > /etc/resolv.conf <<EOF
-nameserver 1.1.1.1
-nameserver 8.8.8.8
-nameserver 8.26.56.26
-EOF
 RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg wget ca-certificates && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app

+ 1 - 0
docker-compose.dev.yml

@@ -78,6 +78,7 @@ services:
             POLL_INTERVAL_MS: ${POLL_INTERVAL_MS}
             WORKER_CONCURRENCY: ${WORKER_CONCURRENCY}
             ENCODER: ${ENCODER}
+            STUCK_TIMEOUT_MS: ${STUCK_TIMEOUT_MS:-300000}
         depends_on:
             postgres-dev:
                 condition: service_healthy

+ 19 - 0
packages/api/src/routes/assets.ts

@@ -381,6 +381,7 @@ router.post('/:id/transcode/cancel', async (req: Request, res: Response) => {
         transcodeProgress: 0,
         transcodeError: null,
         hlsPath: null,
+        transcodePaused: false,
       },
     });
 
@@ -511,6 +512,24 @@ router.delete('/:id', async (req: Request, res: Response) => {
   }
 });
 
+// ── Admin routes (/admin/* paths have two segments, so the single-segment /:id routes should not shadow them — NOTE(review): confirm) ────
+
+// GET /api/assets/admin/stuck-count — lightweight count for the admin "stuck jobs" badge.
+// Responds with { count } for global admins, 403 for everyone else, 500 on failure.
+// The count covers every asset in PROCESSING that is not paused (no project filter and
+// no age cutoff), so jobs that are actively transcoding are included as well.
+router.get('/admin/stuck-count', async (req: Request, res: Response) => {
+  try {
+    // Optional chaining: a request with no user attached is rejected with 403
+    // instead of throwing a TypeError that would surface as a 500.
+    if (req.user?.globalRole !== 'ADMIN') {
+      res.status(403).json({ error: 'Admin access required' });
+      return;
+    }
+    const count = await prisma.asset.count({
+      where: { transcodeStatus: 'PROCESSING', transcodePaused: false },
+    });
+    res.json({ count });
+  } catch (err) {
+    // Keep the failure visible in server logs; the client only gets a generic 500.
+    console.error('[assets] GET /admin/stuck-count failed:', err);
+    res.status(500).json({ error: 'Internal server error' });
+  }
+});
+
 // POST /api/assets/admin/reprocess-all — admin-only: reset all PROCESSING jobs to PENDING
 router.post('/admin/reprocess-all', async (req: Request, res: Response) => {
   try {

+ 36 - 4
packages/api/src/worker/index.js

@@ -420,6 +420,8 @@ async function processJob(asset) {
 }
 
 /** ── Watchdog: re-reset jobs that have been PROCESSING too long without progress ── */
+// Runs independently of the job queue on a timer — catches stuck jobs even when
+// the queue is perpetually full and claimOneJob() never returns null.
 async function resetStuckJobs() {
   const cutoff = new Date(Date.now() - STUCK_TIMEOUT_MS);
   try {
@@ -436,14 +438,39 @@ async function resetStuckJobs() {
         where: { id: { in: stuck.map(s => s.id) } },
         data: { transcodeStatus: 'PENDING', transcodeProgress: 0 },
       });
+      // Put reset jobs into grace period so this worker doesn't immediately re-claim them
+      for (const job of stuck) {
+        recentlyReset.set(job.id, Date.now() + WATCHDOG_GRACE_MS);
+      }
       log('watchdog_reset', { count: stuck.length, cutoffSeconds: STUCK_TIMEOUT_MS / 1000 });
     }
   } catch {}
 }
 
+// Grace period (ms) after a watchdog reset before the job can be re-claimed.
+// Prevents a tight reset → re-claim → fail → reset loop within the same process tick.
+const WATCHDOG_GRACE_MS = Math.max(STUCK_TIMEOUT_MS * 0.5, 30_000); // 50% of timeout or 30s min
+
+// Set of job IDs recently reset by watchdog — excluded from claim for WATCHDOG_GRACE_MS.
+// Reset on each worker restart so stale entries don't persist.
+const recentlyReset = new Set();
+setInterval(() => {
+  // Clean up expired entries from the grace set
+  for (const [jobId, expiresAt] of recentlyReset) {
+    if (Date.now() > expiresAt) recentlyReset.delete(jobId);
+  }
+}, STUCK_TIMEOUT_MS); // clean up roughly once per timeout period
+
 /** ── Claim one job (atomic, skip locked) ─────────────────────────────── */
+// recentlyReset jobs are excluded for WATCHDOG_GRACE_MS to prevent tight
+// reset → re-claim → fail → reset loops.
 async function claimOneJob() {
-  const result = await prisma.$executeRaw`
+  const resetIds = Array.from(recentlyReset.keys());
+  const resetCondition = resetIds.length > 0
+    ? `AND id NOT IN (${resetIds.map(id => `'${id}'`).join(',')})`
+    : '';
+
+  const result = await prisma.$executeRawUnsafe(`
     UPDATE "Asset"
     SET    "transcodeStatus" = 'PROCESSING',
            "transcodeProgress" = 0,
@@ -452,11 +479,12 @@ async function claimOneJob() {
       SELECT id FROM "Asset"
       WHERE  "transcodeStatus" = 'PENDING'
         AND  "transcodePaused" = false
+        ${resetCondition}
       ORDER  BY "createdAt" ASC
       LIMIT  1
       FOR    UPDATE SKIP LOCKED
     )
-  `;
+  `);
 
   if (!result || result === 0) return null;
 
@@ -472,8 +500,8 @@ async function poll() {
   try {
     const claimed = await claimOneJob();
     if (!claimed) {
-      // No job — check for stuck jobs and sleep with backoff
-      await resetStuckJobs();
+      // Queue is empty — sleep and retry.
+      // Watchdog runs on its own timer (see main()), so nothing to do here.
       await sleep(BACKOFF_MS);
       return poll();
     }
@@ -492,6 +520,10 @@ async function main() {
   // Don't run the worker entry point in master — it only forks children
   if (cluster.isMaster) return;
   console.log(`[worker:${WORKER_ID}] Started, ENCODER=${ENCODER}`);
+  // Watchdog timer: independent of poll loop — catches stuck jobs even when queue is full
+  setInterval(async () => {
+    await resetStuckJobs();
+  }, STUCK_TIMEOUT_MS);
   // Start polling immediately — tight recursive loop with backoff when idle
   poll().catch(err => console.error(`[worker:${WORKER_ID}] Fatal poll error:`, err.message));
   log('ready', {});

+ 19 - 3
src/app/(dashboard)/projects/[projectId]/page.tsx

@@ -188,14 +188,29 @@ export default function ProjectDetailPage() {
   const [copiedInviteId, setCopiedInviteId] = useState<string | null>(null);
   const [inviteUrlMap, setInviteUrlMap] = useState<Record<string, string>>({});
   const [reprocessingAll, setReprocessingAll] = useState(false);
+  const [globalStuckCount, setGlobalStuckCount] = useState(0);
 
   const canManage = members.some(m =>
     m.user.id === user?.id && ['ADMIN', 'EDITOR'].includes(m.role)
   );
   const isOwner = project?.ownerId === user?.id;
-  const isAdmin = members.some(m =>
-    m.user.id === user?.id && m.role === 'ADMIN'
-  );
+  const isAdmin = user?.globalRole === 'ADMIN';
+
+  // Poll the workspace-wide stuck job count every 30s.
+  // Runs only when the user is a global admin and a token is available; the effect
+  // is torn down and restarted whenever isAdmin or token changes.
+  useEffect(() => {
+    if (!isAdmin || !token) return;
+    // Set by the cleanup function so a response that arrives after teardown
+    // does not write to state.
+    let cancelled = false;
+    async function fetchStuckCount() {
+      const t = token as string; // non-empty: the guard above returned early otherwise
+      try {
+        const data = await assetsApi.getStuckCount(t);
+        // Fall back to 0 if the response carries no count
+        if (!cancelled) setGlobalStuckCount(data.count ?? 0);
+      } catch {} // best-effort: on failure the previous count is kept
+    }
+    // Fetch once immediately, then on a 30s interval
+    fetchStuckCount();
+    const id = setInterval(fetchStuckCount, 30_000);
+    return () => { cancelled = true; clearInterval(id); };
+  }, [isAdmin, token]);
 
   // ── Folder data derived from state ──────────────────────────────────────────
   // For file mode: only assets directly in the selected folder
@@ -998,6 +1013,7 @@ export default function ProjectDetailPage() {
                 finally { setReprocessingAll(false); }
               }}
               isReprocessingAll={reprocessingAll}
+              globalStuckCount={globalStuckCount}
             />
           </div>
         )}

+ 7 - 5
src/components/transcode/TranscodeTasksPanel.tsx

@@ -1,7 +1,7 @@
 'use client';
 
-import { useState } from 'react';
-import { Asset, TranscodeStatus } from '@/lib/api';
+import { useState, useEffect, useCallback } from 'react';
+import { Asset, TranscodeStatus, assetsApi } from '@/lib/api';
 
 interface Props {
   assets: Asset[];
@@ -15,6 +15,8 @@ interface Props {
   onReprocess: (id: string) => void;
   onReprocessAll: () => void;
   isReprocessingAll: boolean;
+  /** Workspace-wide stuck job count; the parent page polls it every 30s for global admins */
+  globalStuckCount: number;
 }
 
 const STATUS_CONFIG: Record<TranscodeStatus, {
@@ -249,7 +251,7 @@ function TranscodeTaskRow({
   );
 }
 
-export function TranscodeTasksPanel({ assets, canManage, isAdmin, onDelete, onCancel, onPause, onResume, onReprocess, onReprocessAll, isReprocessingAll }: Props) {
+export function TranscodeTasksPanel({ assets, canManage, isAdmin, onDelete, onCancel, onPause, onResume, onReprocess, onReprocessAll, isReprocessingAll, globalStuckCount }: Props) {
   const [filter, setFilter] = useState<'all' | 'processing' | 'completed' | 'failed'>('all');
 
   const filtered = assets.filter(a => {
@@ -299,7 +301,7 @@ export function TranscodeTasksPanel({ assets, canManage, isAdmin, onDelete, onCa
         <div className="flex justify-end mb-4">
           <button
             onClick={onReprocessAll}
-            disabled={isReprocessingAll || stuckCount === 0}
+            disabled={isReprocessingAll || globalStuckCount === 0}
             className="flex items-center gap-2 px-3 py-1.5 rounded-lg text-xs font-medium transition-all disabled:opacity-40"
             style={{
               background: 'rgba(251,146,60,0.10)',
@@ -316,7 +318,7 @@ export function TranscodeTasksPanel({ assets, canManage, isAdmin, onDelete, onCa
               </svg>
             )}
             <span>{isReprocessingAll ? 'Resetting…' : 'Force Reprocess All'}</span>
-            {stuckCount > 0 && !isReprocessingAll && (
+            {globalStuckCount > 0 && !isReprocessingAll && (
               <span className="text-[10px] px-1.5 py-0.5 rounded-full" style={{ background: 'rgba(251,146,60,0.20)' }}>
                 {stuckCount} stuck
               </span>

+ 3 - 0
src/lib/api.ts

@@ -157,6 +157,9 @@ export const assetsApi = {
 
   reprocessAll: (token: string) =>
     apiFetch<{ message: string; count: number }>(`/api/assets/admin/reprocess-all`, { method: 'POST', token }),
+
+  getStuckCount: (token: string) =>
+    apiFetch<{ count: number }>(`/api/assets/admin/stuck-count`, { token }),
 };
 
 // ── Comments ─────────────────────────────────────────────────────────────────