From 7e6c6657bced35f4a3aba723c2add20923450ad6 Mon Sep 17 00:00:00 2001
From: gbellato <gbellato@localhost>
Date: Fri, 01 Feb 2008 09:40:56 +0000
Subject: [PATCH] The Replication Server thread that is reading changes from the database to propagate them to the Directory Servers was sometimes leaving some Database cursors open.
---
opends/src/server/org/opends/server/replication/server/DbHandler.java | 86 ++++++++++++++++++++++++++++--------------
1 files changed, 57 insertions(+), 29 deletions(-)
diff --git a/opends/src/server/org/opends/server/replication/server/DbHandler.java b/opends/src/server/org/opends/server/replication/server/DbHandler.java
index f3aabe9..f6c2c11 100644
--- a/opends/src/server/org/opends/server/replication/server/DbHandler.java
+++ b/opends/src/server/org/opends/server/replication/server/DbHandler.java
@@ -51,6 +51,7 @@
import org.opends.server.replication.server.ReplicationDB.ReplServerDBCursor;
import com.sleepycat.je.DatabaseException;
+import com.sleepycat.je.DeadlockException;
/**
* This class is used for managing the replicationServer database for each
@@ -99,6 +100,9 @@
final static int MSG_QUEUE_HIMARK = 5000;
final static int MSG_QUEUE_LOWMARK = 4000;
+ // The maximum number of retries in case of a DeadlockException.
+ private static final int DEADLOCK_RETRIES = 10;
+
/**
*
* The trim age in milliseconds. Changes record in the change DB that
@@ -285,7 +289,10 @@
}
}
- return new ReplicationIterator(serverId, db, changeNumber);
+ ReplicationIterator it =
+ new ReplicationIterator(serverId, db, changeNumber);
+
+ return it;
}
/**
@@ -397,46 +404,67 @@
return;
int size = 0;
boolean finished = false;
+ boolean done = false;
ChangeNumber trimDate = new ChangeNumber(TimeThread.getTime() - trimage,
(short) 0, (short)0);
- /* the trim is done by group in order to save some CPU and IO bandwidth
- * start the transaction then do a bunch of remove then commit
- */
- ReplServerDBCursor cursor;
+ // In case of deadlock detection by the Database, this thread can
+ // be aborted by a DeadlockException. This is a transient error and
+ // the transaction should be attempted again.
+ // We will try DEADLOCK_RETRIES times before failing.
+ int tries = 0;
+ while ((tries++ < DEADLOCK_RETRIES) && (!done))
+ {
+ /* the trim is done by group in order to save some CPU and IO bandwidth
+ * start the transaction then do a bunch of remove then commit
+ */
+ ReplServerDBCursor cursor;
+ cursor = db.openDeleteCursor();
- cursor = db.openDeleteCursor();
-
- try {
- while ((size < 5000 ) && (!finished))
+ try
{
- ChangeNumber changeNumber = cursor.nextChangeNumber();
- if (changeNumber != null)
+ while ((size < 5000 ) && (!finished))
{
- if ((!changeNumber.equals(lastChange))
- && (changeNumber.older(trimDate)))
+ ChangeNumber changeNumber = cursor.nextChangeNumber();
+ if (changeNumber != null)
{
- size++;
- cursor.delete();
+ if ((!changeNumber.equals(lastChange))
+ && (changeNumber.older(trimDate)))
+ {
+ size++;
+ cursor.delete();
+ }
+ else
+ {
+ firstChange = changeNumber;
+ finished = true;
+ }
}
else
- {
- firstChange = changeNumber;
finished = true;
- }
}
- else
- finished = true;
+ cursor.close();
+ done = true;
}
-
- cursor.close();
- } catch (DatabaseException e)
- {
- // mark shutdown for this db so that we don't try again to
- // stop it from cursor.close() or methods called by cursor.close()
- shutdown = true;
- cursor.close();
- throw (e);
+ catch (DeadlockException e)
+ {
+ cursor.abort();
+ if (tries == DEADLOCK_RETRIES)
+ {
+ // could not handle the Deadlock after DEADLOCK_RETRIES tries.
+ // shutdown the ReplicationServer.
+ shutdown = true;
+ throw (e);
+ }
+ }
+ catch (DatabaseException e)
+ {
+ // mark shutdown for this db so that we don't try again to
+ // stop it from cursor.close() or methods called by cursor.close()
+ shutdown = true;
+ cursor.abort();
+ throw (e);
+ }
}
}
--
Gitblit v1.10.0