From 9e1f377c4f21b899d16f4c62450c68691f4b42a8 Mon Sep 17 00:00:00 2001
From: Ludovic Poitou <ludovic.poitou@forgerock.com>
Date: Thu, 20 Jun 2013 15:02:35 +0000
Subject: [PATCH] Fix for OPENDJ-846, Intermittent Replication failure. The issue was triggered by the mix of AssuredReplication and bad network conditions, which resulted in a deadlock between 2 RS, as both were blocked on writing to the TCP socket and not reading (because waiting on the write lock). The solution (more of a workaround) is to have another thread for sending data to the socket and have the reader and writer posting data to send to a queue that this new thread is polling. There are still potential deadlocks but they will occur much later, if the sendQueue gets full. The code needs more work post 2.6 to be fully non blocking, but the changes are enough for now to resolve the customer deadlock case.
---
opends/src/server/org/opends/server/replication/server/ReplicationServerDomain.java | 49 +++++++++++++++++++------------------------------
1 files changed, 19 insertions(+), 30 deletions(-)
diff --git a/opends/src/server/org/opends/server/replication/server/ReplicationServerDomain.java b/opends/src/server/org/opends/server/replication/server/ReplicationServerDomain.java
index a6ebfc1..47f3d7f 100644
--- a/opends/src/server/org/opends/server/replication/server/ReplicationServerDomain.java
+++ b/opends/src/server/org/opends/server/replication/server/ReplicationServerDomain.java
@@ -187,7 +187,7 @@
// The timer used to run the timeout code (timer tasks) for the assured update
// messages we are waiting acks for.
private Timer assuredTimeoutTimer = null;
- // Counter used to purge the timer tasks referemces in assuredTimeoutTimer,
+ // Counter used to purge the timer tasks references in assuredTimeoutTimer,
// every n number of treated assured messages
private int assuredTimeoutTimerPurgeCounter = 0;
@@ -588,17 +588,16 @@
if (serverStatus == ServerStatus.DEGRADED_STATUS)
{
wrongStatusServers.add(handler.getServerId());
- } else
- {
- /**
- * BAD_GEN_ID_STATUS or FULL_UPDATE_STATUS:
- * We do not want this to be reported as an error to the update
- * maker -> no pollution or potential misunderstanding when
- * reading logs or monitoring and it was just administration (for
- * instance new server is being configured in topo: it goes in bad
- * gen then then full full update).
- */
}
+ /**
+ * else
+ * BAD_GEN_ID_STATUS or FULL_UPDATE_STATUS:
+ * We do not want this to be reported as an error to the update
+ * maker -> no pollution or potential misunderstanding when
+ * reading logs or monitoring and it was just administration (for
+ * instance new server is being configured in topo: it goes in bad
+ * gen then full update).
+ */
}
}
}
@@ -685,19 +684,12 @@
}
} else
{ // A RS sent us the safe data message, for sure no further ack to wait
- if (safeDataLevel == (byte) 1)
+ /**
+ * Level 1 has already been reached so no further acks to wait.
+ * Just deal with level > 1
+ */
+ if (safeDataLevel > (byte) 1)
{
- /**
- * The original level was 1 so the RS that sent us this message
- * should have already sent his ack to the sender DS. Level 1 has
- * already been reached so no further acks to wait.
- * This should not happen in theory as the sender RS server should
- * have sent us a matching not assured message so we should not come
- * to here.
- */
- } else
- {
- // level > 1, so Ack this message to originator RS
sourceHandler.send(new AckMsg(cn));
}
}
@@ -815,11 +807,10 @@
expectedAcksInfo.completed();
}
}
- } else
- {
- // The timeout occurred for the update matching this change number and the
- // ack with timeout error has probably already been sent.
}
+ /* Else the timeout occurred for the update matching this change number
+ * and the ack with timeout error has probably already been sent.
+ */
}
/**
@@ -934,10 +925,8 @@
expectedServerInTimeout.
incrementAssuredSdSentUpdatesTimeout();
}
- } else
- {
- // Server disappeared ? Let's forget about it.
}
+ /* else server disappeared ? Let's forget about it. */
}
}
// Mark the ack info object as completed to prevent potential
--
Gitblit v1.10.0