mirror of https://github.com/OpenIdentityPlatform/OpenDJ.git

copilot-swe-agent[bot]
2 days ago 7121eeb999c8735113b20fc7ac39b22759bdade3
Fix Windows service premature exit and stop timeout in service.c

- Add retry logic (3 retries × 2s) in serviceMain before declaring server
stopped to avoid false negatives from transient lock file check failures
- Increase doStopApplication nTries from 10 to 30 for more heavily loaded servers

Agent-Logs-Url: https://github.com/OpenIdentityPlatform/OpenDJ/sessions/b618397b-6f5a-4c9b-8bec-3bb14df3e3e3

Co-authored-by: vharseko <6818498+vharseko@users.noreply.github.com>
1 file modified
70 ■■■■■ changed files
opendj-server-legacy/src/build-tools/windows/service.c 70 ●●●●● patch | view | raw | blame | history
opendj-server-legacy/src/build-tools/windows/service.c
@@ -700,7 +700,7 @@
    if (spawn(command, FALSE) != -1)
    {
      // Try to see if server is really stopped
      int nTries = 10;
      int nTries = 30;
      BOOL running = TRUE;
      debug("doStopApplication: the spawn of the process worked.");
@@ -1225,31 +1225,53 @@
        }
        else
        {
      // Check current Status
      DWORD state;
      BOOL success = getServiceStatus(serviceName, &state);
          if (!(success &&
               ((state == SERVICE_STOPPED) ||
                (state == SERVICE_STOP_PENDING))))
          // Server appears not running - retry a few times before concluding
          // it has actually stopped (the lock file check can be transient,
          // e.g. during JVM GC pressure or heavy I/O after a large ldapsearch).
          // 3 retries × 2 seconds gives up to 6 extra seconds of tolerance.
          int retryCount = 3;
          BOOL confirmedStopped = TRUE;
          while (retryCount > 0)
          {
          WORD argCount = 1;
            const char *argc[] = {_instanceDir};
            _serviceCurStatus = SERVICE_STOPPED;
            debug("checking in serviceMain serviceHandler: service stopped with error.");
            retryCount--;
            Sleep(2000); // wait 2 seconds between retries before re-checking
            code = isServerRunning(&running, TRUE);
            if (code == SERVICE_RETURN_OK && running)
            {
              confirmedStopped = FALSE;
              break;
            }
          }
            updateServiceStatus (
              _serviceCurStatus,
              ERROR_SERVICE_SPECIFIC_ERROR,
              -1,
              CHECKPOINT_NO_ONGOING_OPERATION,
              TIMEOUT_NONE,
              _serviceStatusHandle);
            reportLogEvent(
              EVENTLOG_ERROR_TYPE,
              WIN_EVENT_ID_SERVER_STOPPED_OUTSIDE_SCM,
              argCount, argc);
           }
          break;
          if (confirmedStopped)
          {
            // Check current Status
            DWORD state;
            BOOL success = getServiceStatus(serviceName, &state);
            if (!(success &&
                 ((state == SERVICE_STOPPED) ||
                  (state == SERVICE_STOP_PENDING))))
            {
              WORD argCount = 1;
              const char *argc[] = {_instanceDir};
              _serviceCurStatus = SERVICE_STOPPED;
              debug("checking in serviceMain serviceHandler: service stopped with error.");
              updateServiceStatus (
                _serviceCurStatus,
                ERROR_SERVICE_SPECIFIC_ERROR,
                -1,
                CHECKPOINT_NO_ONGOING_OPERATION,
                TIMEOUT_NONE,
                _serviceStatusHandle);
              reportLogEvent(
                EVENTLOG_ERROR_TYPE,
                WIN_EVENT_ID_SERVER_STOPPED_OUTSIDE_SCM,
                argCount, argc);
            }
            break;
          }
          // else: server is actually still running, continue monitoring
        }
      }
    }