From 2caca3a5c55076f212fe2b2d725769737160c59c Mon Sep 17 00:00:00 2001
From: ludovicp <ludovicp@localhost>
Date: Mon, 31 May 2010 08:33:05 +0000
Subject: [PATCH] Fix for issue #4526. Fixes a race condition in Replication Server when resetting the GenerationID

---
 opends/src/server/org/opends/server/replication/service/ReplicationDomain.java |   84 +++++++++++++++++++++++++++--------------
 1 files changed, 55 insertions(+), 29 deletions(-)

diff --git a/opends/src/server/org/opends/server/replication/service/ReplicationDomain.java b/opends/src/server/org/opends/server/replication/service/ReplicationDomain.java
index 2ee1ec0..de6d555 100644
--- a/opends/src/server/org/opends/server/replication/service/ReplicationDomain.java
+++ b/opends/src/server/org/opends/server/replication/service/ReplicationDomain.java
@@ -591,19 +591,21 @@
   }
 
   /**
-   * Check if a remote replica (DS) is connected to the topology based on
-   * the TopologyMsg we received when the remote replica connected or
-   * disconnected.
+   * Returns information about the DS server related to the provided serverId,
+   * based on the TopologyMsg we received when the remote replica connected or
+   * disconnected. Returns null when no server with the provided serverId is
+   * connected.
    *
-   * @param serverId The provided serverId of the remote replica
-   * @return whether the remote replica is connected or not.
+   * @param  serverId The provided serverId of the remote replica
+   * @return the info related to this remote server if it is connected,
+   *                  null if the server is NOT connected.
    */
-  public boolean isRemoteDSConnected(int serverId)
+  public DSInfo isRemoteDSConnected(int serverId)
   {
     for (DSInfo remoteDS : getReplicasList())
       if (remoteDS.getDsId() == serverId)
-        return true;
-    return false;
+        return remoteDS;
+    return null;
   }
 
   /**
@@ -1670,13 +1672,12 @@
   }
 
   /*
-   * For all remote servers in tht start list,
+   * For all remote servers in the start list,
    * - wait it has finished the import and present the expected generationID
    * - build the failureList
    */
   private void waitForRemoteEndOfInit()
   {
-    int waitResultAttempt = 0;
     Set<Integer> replicasWeAreWaitingFor =  new HashSet<Integer>(0);
 
     for (Integer sid : ieContext.startList)
@@ -1696,36 +1697,60 @@
     do
     {
       done = true;
-      for (DSInfo dsi : getReplicasList())
+      short reconnectMaxDelayInSec = 10;
+      short reconnectWait = 0;
+      for (int serverId : replicasWeAreWaitingFor)
       {
-        if (debugEnabled())
-          TRACER.debugInfo(
-            "[IE] wait for end dsid " + dsi.getDsId()
-            + " " + dsi.getStatus()
-            + " " + dsi.getGenerationId()
-            + " " + this.getGenerationID());
-        if (!ieContext.failureList.contains(dsi.getDsId()))
+        if (ieContext.failureList.contains(serverId))
         {
-          if (dsi.getStatus() == ServerStatus.FULL_UPDATE_STATUS)
+          // this server has already failed during initialization:
+          // don't wait for it
+          continue;
+        }
+
+        DSInfo dsInfo = null;
+        dsInfo = isRemoteDSConnected(serverId);
+        if (dsInfo == null)
+        {
+          // this server is disconnected:
+          // maybe for a long time, if it crashed or has been stopped,
+          // or maybe only while it reconnects after the import (should be short)
+          if (++reconnectWait<reconnectMaxDelayInSec)
+          {
+            // let's still wait to give a chance to this server to reconnect
+            done = false;
+          }
+          else
+          {
+            // we gave the servers enough time to reconnect - now it's too
+            // late
+          }
+        }
+        else
+        {
+          // this server is connected
+          if (dsInfo.getStatus() == ServerStatus.FULL_UPDATE_STATUS)
           {
             // this one is still doing the Full Update ... retry later
             done = false;
-            try
-            { Thread.sleep(1000); } catch (InterruptedException e) {} // 1s
-            waitResultAttempt++;
             break;
           }
           else
           {
             // this one is done with the Full Update
-            if (dsi.getGenerationId() == this.getGenerationID())
+            if (dsInfo.getGenerationId() == this.getGenerationID())
             {
               // and with the expected generationId
-              replicasWeAreWaitingFor.remove(dsi.getDsId());
+              replicasWeAreWaitingFor.remove(serverId);
             }
           }
         }
       }
+
+      // loop and wait
+      if (!done)
+        try { Thread.sleep(1000); } catch (InterruptedException e) {} // 1sec
+
     }
     while ((!done) && (!broker.shuttingDown())); // infinite wait
 
@@ -1921,7 +1946,7 @@
           // Other messages received during an import are trashed except
           // the topologyMsg.
           if ((msg instanceof TopologyMsg) &&
-              (!this.isRemoteDSConnected(ieContext.importSource)))
+              (isRemoteDSConnected(ieContext.importSource)==null))
           {
             Message errMsg =
               Message.raw(Category.SYNC, Severity.NOTICE,
@@ -2013,7 +2038,7 @@
         throw(new IOException(ieContext.getException().getMessage()));
 
       int slowestServerId = ieContext.getSlowestServer();
-      if (!isRemoteDSConnected(slowestServerId))
+      if (isRemoteDSConnected(slowestServerId)==null)
       {
         ieContext.setException(new DirectoryException(ResultCode.OTHER,
             ERR_INIT_HEARTBEAT_LOST_DURING_EXPORT.get(
@@ -2491,12 +2516,14 @@
   {
     boolean allset = true;
 
-    for (int i = 0; i< 10; i++)
+    for (int i = 0; i< 50; i++)
     {
       allset = true;
       for (RSInfo rsInfo : getRsList())
       {
-        if (rsInfo.getGenerationId() != generationID)
+        // the 'empty' RSes (generationId==-1) are considered as good citizens
+        if ((rsInfo.getGenerationId() != -1) &&
+            (rsInfo.getGenerationId() != generationID))
         {
           try
           {
@@ -2513,7 +2540,6 @@
         break;
       }
     }
-
     if (!allset)
     {
       ResultCode resultCode = ResultCode.OTHER;

--
Gitblit v1.10.0