001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.ha;
020
021 import java.io.IOException;
022 import java.util.Arrays;
023 import java.util.List;
024 import java.util.concurrent.locks.Lock;
025 import java.util.concurrent.locks.ReentrantLock;
026
027 import org.apache.commons.logging.Log;
028 import org.apache.commons.logging.LogFactory;
029 import org.apache.hadoop.HadoopIllegalArgumentException;
030 import org.apache.hadoop.classification.InterfaceAudience;
031 import org.apache.hadoop.classification.InterfaceStability;
032 import org.apache.hadoop.util.StringUtils;
033 import org.apache.zookeeper.data.ACL;
034 import org.apache.zookeeper.KeeperException;
035 import org.apache.zookeeper.Watcher;
036 import org.apache.zookeeper.WatchedEvent;
037 import org.apache.zookeeper.Watcher.Event;
038 import org.apache.zookeeper.ZKUtil;
039 import org.apache.zookeeper.ZooKeeper;
040 import org.apache.zookeeper.CreateMode;
041 import org.apache.zookeeper.AsyncCallback.*;
042 import org.apache.zookeeper.data.Stat;
043 import org.apache.zookeeper.KeeperException.Code;
044
045 import com.google.common.annotations.VisibleForTesting;
046 import com.google.common.base.Preconditions;
047
/**
 * This class implements a simple library to perform leader election on top of
 * Apache Zookeeper. Using Zookeeper as a coordination service, leader election
 * can be performed by atomically creating an ephemeral lock file (znode) on
 * Zookeeper. The service instance that successfully creates the znode becomes
 * active and the rest become standbys. <br/>
 * This election mechanism is only efficient for a small number of election
 * candidates (on the order of tens) because contention on a single znode by a
 * large number of candidates can result in Zookeeper overload. <br/>
 * The elector does not guarantee fencing (protection of shared resources) among
 * service instances. After it has notified an instance about becoming a leader,
 * that instance must ensure that it meets the service consistency
 * requirements. If it cannot do so, then it is recommended to quit the
 * election. The application implements the {@link ActiveStandbyElectorCallback}
 * to interact with the elector.
 */
065 @InterfaceAudience.Private
066 @InterfaceStability.Evolving
067 public class ActiveStandbyElector implements StatCallback, StringCallback {
068
/**
 * Contract through which the ActiveStandbyElector reports election results
 * to the application. <br/>
 * {@link #becomeActive()}, {@link #becomeStandby()} and
 * {@link #enterNeutralMode()} are only issued when the elector's own state
 * actually changes, so the same transition is never reported twice in a
 * row. <br/>
 * Callbacks are normally delivered on Zookeeper client library threads.
 * Implementations should therefore return quickly: record the new state,
 * return, and carry out any expensive reaction elsewhere. A further callback
 * may arrive while that reaction is still in progress, and the application
 * must be prepared for that.
 */
public interface ActiveStandbyElectorCallback {
  /**
   * Invoked when this instance has won the election and is now the active
   * leader.
   */
  void becomeActive();

  /**
   * Invoked when another instance holds the lock and this instance is
   * therefore a standby.
   */
  void becomeStandby();

  /**
   * Invoked when the elector no longer knows who holds the lock, for example
   * because it was disconnected from Zookeeper. The service may ignore this
   * or may pause state-changing operations. Once the connection is back the
   * elector re-checks the leader status and reports it via
   * {@link #becomeActive()} or {@link #becomeStandby()}. <br/>
   * Disconnects can be caused by network problems or by loss of the
   * Zookeeper quorum, so this notification can be used to guard against
   * split-brain. Transitioning to standby in that situation may be prudent,
   * but such a transition can be expensive; neutral mode lets the service
   * avoid paying that cost for a transient problem.
   */
  void enterNeutralMode();

  /**
   * Invoked on unrecoverable problems (e.g. wrong ACLs, unexpected Zookeeper
   * errors or persistent Zookeeper unavailability).
   *
   * @param errorMessage description of the failure
   */
  void notifyFatalError(String errorMessage);

  /**
   * Invoked on the newly elected active when the previous active did not
   * exit gracefully, so that it can be fenced before failover proceeds.
   *
   * @param oldActiveData the application data provided by the prior active
   */
  void fenceOldActive(byte[] oldActiveData);
}
125
126 /**
127 * Name of the lock znode used by the library. Protected for access in test
128 * classes
129 */
130 @VisibleForTesting
131 protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock";
132 @VisibleForTesting
133 protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb";
134
135 public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class);
136
137 private static final int NUM_RETRIES = 3;
138
139 private static enum ConnectionState {
140 DISCONNECTED, CONNECTED, TERMINATED
141 };
142
143 static enum State {
144 INIT, ACTIVE, STANDBY, NEUTRAL
145 };
146
147 private State state = State.INIT;
148 private int createRetryCount = 0;
149 private int statRetryCount = 0;
150 private ZooKeeper zkClient;
151 private ConnectionState zkConnectionState = ConnectionState.TERMINATED;
152
153 private final ActiveStandbyElectorCallback appClient;
154 private final String zkHostPort;
155 private final int zkSessionTimeout;
156 private final List<ACL> zkAcl;
157 private byte[] appData;
158 private final String zkLockFilePath;
159 private final String zkBreadCrumbPath;
160 private final String znodeWorkingDir;
161
162 private Lock sessionReestablishLockForTests = new ReentrantLock();
163 private boolean wantToBeInElection;
164
165 /**
166 * Create a new ActiveStandbyElector object <br/>
167 * The elector is created by providing to it the Zookeeper configuration, the
168 * parent znode under which to create the znode and a reference to the
169 * callback interface. <br/>
170 * The parent znode name must be the same for all service instances and
171 * different across services. <br/>
172 * After the leader has been lost, a new leader will be elected after the
173 * session timeout expires. Hence, the app must set this parameter based on
174 * its needs for failure response time. The session timeout must be greater
175 * than the Zookeeper disconnect timeout and is recommended to be 3X that
176 * value to enable Zookeeper to retry transient disconnections. Setting a very
177 * short session timeout may result in frequent transitions between active and
178 * standby states during issues like network outages/GS pauses.
179 *
180 * @param zookeeperHostPorts
181 * ZooKeeper hostPort for all ZooKeeper servers
182 * @param zookeeperSessionTimeout
183 * ZooKeeper session timeout
184 * @param parentZnodeName
185 * znode under which to create the lock
186 * @param acl
187 * ZooKeeper ACL's
188 * @param app
189 * reference to callback interface object
190 * @throws IOException
191 * @throws HadoopIllegalArgumentException
192 */
193 public ActiveStandbyElector(String zookeeperHostPorts,
194 int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
195 ActiveStandbyElectorCallback app) throws IOException,
196 HadoopIllegalArgumentException {
197 if (app == null || acl == null || parentZnodeName == null
198 || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
199 throw new HadoopIllegalArgumentException("Invalid argument");
200 }
201 zkHostPort = zookeeperHostPorts;
202 zkSessionTimeout = zookeeperSessionTimeout;
203 zkAcl = acl;
204 appClient = app;
205 znodeWorkingDir = parentZnodeName;
206 zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME;
207 zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
208
209 // createConnection for future API calls
210 createConnection();
211 }
212
213 /**
214 * To participate in election, the app will call joinElection. The result will
215 * be notified by a callback on either the becomeActive or becomeStandby app
216 * interfaces. <br/>
217 * After this the elector will automatically monitor the leader status and
218 * perform re-election if necessary<br/>
219 * The app could potentially start off in standby mode and ignore the
220 * becomeStandby call.
221 *
222 * @param data
223 * to be set by the app. non-null data must be set.
224 * @throws HadoopIllegalArgumentException
225 * if valid data is not supplied
226 */
227 public synchronized void joinElection(byte[] data)
228 throws HadoopIllegalArgumentException {
229
230 LOG.debug("Attempting active election");
231
232 if (data == null) {
233 throw new HadoopIllegalArgumentException("data cannot be null");
234 }
235
236 appData = new byte[data.length];
237 System.arraycopy(data, 0, appData, 0, data.length);
238
239 joinElectionInternal();
240 }
241
242 /**
243 * @return true if the configured parent znode exists
244 */
245 public synchronized boolean parentZNodeExists()
246 throws IOException, InterruptedException {
247 Preconditions.checkState(zkClient != null);
248 try {
249 return zkClient.exists(znodeWorkingDir, false) != null;
250 } catch (KeeperException e) {
251 throw new IOException("Couldn't determine existence of znode '" +
252 znodeWorkingDir + "'", e);
253 }
254 }
255
256 /**
257 * Utility function to ensure that the configured base znode exists.
258 * This recursively creates the znode as well as all of its parents.
259 */
260 public synchronized void ensureParentZNode()
261 throws IOException, InterruptedException {
262 String pathParts[] = znodeWorkingDir.split("/");
263 Preconditions.checkArgument(pathParts.length >= 1 &&
264 "".equals(pathParts[0]),
265 "Invalid path: %s", znodeWorkingDir);
266
267 StringBuilder sb = new StringBuilder();
268 for (int i = 1; i < pathParts.length; i++) {
269 sb.append("/").append(pathParts[i]);
270 String prefixPath = sb.toString();
271 LOG.debug("Ensuring existence of " + prefixPath);
272 try {
273 createWithRetries(prefixPath, new byte[]{}, zkAcl, CreateMode.PERSISTENT);
274 } catch (KeeperException e) {
275 if (isNodeExists(e.code())) {
276 // This is OK - just ensuring existence.
277 continue;
278 } else {
279 throw new IOException("Couldn't create " + prefixPath, e);
280 }
281 }
282 }
283
284 LOG.info("Successfully created " + znodeWorkingDir + " in ZK.");
285 }
286
287 /**
288 * Clear all of the state held within the parent ZNode.
289 * This recursively deletes everything within the znode as well as the
290 * parent znode itself. It should only be used when it's certain that
291 * no electors are currently participating in the election.
292 */
293 public synchronized void clearParentZNode()
294 throws IOException, InterruptedException {
295 try {
296 LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
297
298 zkDoWithRetries(new ZKAction<Void>() {
299 @Override
300 public Void run() throws KeeperException, InterruptedException {
301 ZKUtil.deleteRecursive(zkClient, znodeWorkingDir);
302 return null;
303 }
304 });
305 } catch (KeeperException e) {
306 throw new IOException("Couldn't clear parent znode " + znodeWorkingDir,
307 e);
308 }
309 LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK.");
310 }
311
312
313 /**
314 * Any service instance can drop out of the election by calling quitElection.
315 * <br/>
316 * This will lose any leader status, if held, and stop monitoring of the lock
317 * node. <br/>
318 * If the instance wants to participate in election again, then it needs to
319 * call joinElection(). <br/>
320 * This allows service instances to take themselves out of rotation for known
321 * impending unavailable states (e.g. long GC pause or software upgrade).
322 *
323 * @param needFence true if the underlying daemon may need to be fenced
324 * if a failover occurs due to dropping out of the election.
325 */
326 public synchronized void quitElection(boolean needFence) {
327 LOG.info("Yielding from election");
328 if (!needFence && state == State.ACTIVE) {
329 // If active is gracefully going back to standby mode, remove
330 // our permanent znode so no one fences us.
331 tryDeleteOwnBreadCrumbNode();
332 }
333 reset();
334 wantToBeInElection = false;
335 }
336
337 /**
338 * Exception thrown when there is no active leader
339 */
340 public static class ActiveNotFoundException extends Exception {
341 private static final long serialVersionUID = 3505396722342846462L;
342 }
343
344 /**
345 * get data set by the active leader
346 *
347 * @return data set by the active instance
348 * @throws ActiveNotFoundException
349 * when there is no active leader
350 * @throws KeeperException
351 * other zookeeper operation errors
352 * @throws InterruptedException
353 * @throws IOException
354 * when ZooKeeper connection could not be established
355 */
356 public synchronized byte[] getActiveData() throws ActiveNotFoundException,
357 KeeperException, InterruptedException, IOException {
358 try {
359 if (zkClient == null) {
360 createConnection();
361 }
362 Stat stat = new Stat();
363 return zkClient.getData(zkLockFilePath, false, stat);
364 } catch(KeeperException e) {
365 Code code = e.code();
366 if (isNodeDoesNotExist(code)) {
367 // handle the commonly expected cases that make sense for us
368 throw new ActiveNotFoundException();
369 } else {
370 throw e;
371 }
372 }
373 }
374
375 /**
376 * interface implementation of Zookeeper callback for create
377 */
378 @Override
379 public synchronized void processResult(int rc, String path, Object ctx,
380 String name) {
381 if (isStaleClient(ctx)) return;
382 LOG.debug("CreateNode result: " + rc + " for path: " + path
383 + " connectionState: " + zkConnectionState);
384
385 Code code = Code.get(rc);
386 if (isSuccess(code)) {
387 // we successfully created the znode. we are the leader. start monitoring
388 becomeActive();
389 monitorActiveStatus();
390 return;
391 }
392
393 if (isNodeExists(code)) {
394 if (createRetryCount == 0) {
395 // znode exists and we did not retry the operation. so a different
396 // instance has created it. become standby and monitor lock.
397 becomeStandby();
398 }
399 // if we had retried then the znode could have been created by our first
400 // attempt to the server (that we lost) and this node exists response is
401 // for the second attempt. verify this case via ephemeral node owner. this
402 // will happen on the callback for monitoring the lock.
403 monitorActiveStatus();
404 return;
405 }
406
407 String errorMessage = "Received create error from Zookeeper. code:"
408 + code.toString() + " for path " + path;
409 LOG.debug(errorMessage);
410
411 if (shouldRetry(code)) {
412 if (createRetryCount < NUM_RETRIES) {
413 LOG.debug("Retrying createNode createRetryCount: " + createRetryCount);
414 ++createRetryCount;
415 createLockNodeAsync();
416 return;
417 }
418 errorMessage = errorMessage
419 + ". Not retrying further znode create connection errors.";
420 } else if (isSessionExpired(code)) {
421 // This isn't fatal - the client Watcher will re-join the election
422 LOG.warn("Lock acquisition failed because session was lost");
423 return;
424 }
425
426 fatalError(errorMessage);
427 }
428
429 /**
430 * interface implementation of Zookeeper callback for monitor (exists)
431 */
432 @Override
433 public synchronized void processResult(int rc, String path, Object ctx,
434 Stat stat) {
435 if (isStaleClient(ctx)) return;
436 LOG.debug("StatNode result: " + rc + " for path: " + path
437 + " connectionState: " + zkConnectionState);
438
439 Code code = Code.get(rc);
440 if (isSuccess(code)) {
441 // the following owner check completes verification in case the lock znode
442 // creation was retried
443 if (stat.getEphemeralOwner() == zkClient.getSessionId()) {
444 // we own the lock znode. so we are the leader
445 becomeActive();
446 } else {
447 // we dont own the lock znode. so we are a standby.
448 becomeStandby();
449 }
450 // the watch set by us will notify about changes
451 return;
452 }
453
454 if (isNodeDoesNotExist(code)) {
455 // the lock znode disappeared before we started monitoring it
456 enterNeutralMode();
457 joinElectionInternal();
458 return;
459 }
460
461 String errorMessage = "Received stat error from Zookeeper. code:"
462 + code.toString();
463 LOG.debug(errorMessage);
464
465 if (shouldRetry(code)) {
466 if (statRetryCount < NUM_RETRIES) {
467 ++statRetryCount;
468 monitorLockNodeAsync();
469 return;
470 }
471 errorMessage = errorMessage
472 + ". Not retrying further znode monitoring connection errors.";
473 }
474
475 fatalError(errorMessage);
476 }
477
478 /**
479 * interface implementation of Zookeeper watch events (connection and node)
480 */
481 synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
482 Event.EventType eventType = event.getType();
483 if (isStaleClient(zk)) return;
484 LOG.debug("Watcher event type: " + eventType + " with state:"
485 + event.getState() + " for path:" + event.getPath()
486 + " connectionState: " + zkConnectionState);
487
488 if (eventType == Event.EventType.None) {
489 // the connection state has changed
490 switch (event.getState()) {
491 case SyncConnected:
492 LOG.info("Session connected.");
493 // if the listener was asked to move to safe state then it needs to
494 // be undone
495 ConnectionState prevConnectionState = zkConnectionState;
496 zkConnectionState = ConnectionState.CONNECTED;
497 if (prevConnectionState == ConnectionState.DISCONNECTED) {
498 monitorActiveStatus();
499 }
500 break;
501 case Disconnected:
502 LOG.info("Session disconnected. Entering neutral mode...");
503
504 // ask the app to move to safe state because zookeeper connection
505 // is not active and we dont know our state
506 zkConnectionState = ConnectionState.DISCONNECTED;
507 enterNeutralMode();
508 break;
509 case Expired:
510 // the connection got terminated because of session timeout
511 // call listener to reconnect
512 LOG.info("Session expired. Entering neutral mode and rejoining...");
513 enterNeutralMode();
514 reJoinElection();
515 break;
516 default:
517 fatalError("Unexpected Zookeeper watch event state: "
518 + event.getState());
519 break;
520 }
521
522 return;
523 }
524
525 // a watch on lock path in zookeeper has fired. so something has changed on
526 // the lock. ideally we should check that the path is the same as the lock
527 // path but trusting zookeeper for now
528 String path = event.getPath();
529 if (path != null) {
530 switch (eventType) {
531 case NodeDeleted:
532 if (state == State.ACTIVE) {
533 enterNeutralMode();
534 }
535 joinElectionInternal();
536 break;
537 case NodeDataChanged:
538 monitorActiveStatus();
539 break;
540 default:
541 LOG.debug("Unexpected node event: " + eventType + " for path: " + path);
542 monitorActiveStatus();
543 }
544
545 return;
546 }
547
548 // some unexpected error has occurred
549 fatalError("Unexpected watch error from Zookeeper");
550 }
551
552 /**
553 * Get a new zookeeper client instance. protected so that test class can
554 * inherit and pass in a mock object for zookeeper
555 *
556 * @return new zookeeper client instance
557 * @throws IOException
558 */
559 protected synchronized ZooKeeper getNewZooKeeper() throws IOException {
560 ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, null);
561 zk.register(new WatcherWithClientRef(zk));
562 return zk;
563 }
564
565 private void fatalError(String errorMessage) {
566 reset();
567 appClient.notifyFatalError(errorMessage);
568 }
569
570 private void monitorActiveStatus() {
571 LOG.debug("Monitoring active leader");
572 statRetryCount = 0;
573 monitorLockNodeAsync();
574 }
575
576 private void joinElectionInternal() {
577 if (zkClient == null) {
578 if (!reEstablishSession()) {
579 fatalError("Failed to reEstablish connection with ZooKeeper");
580 return;
581 }
582 }
583
584 createRetryCount = 0;
585 wantToBeInElection = true;
586 createLockNodeAsync();
587 }
588
589 private void reJoinElection() {
590 LOG.info("Trying to re-establish ZK session");
591
592 // Some of the test cases rely on expiring the ZK sessions and
593 // ensuring that the other node takes over. But, there's a race
594 // where the original lease holder could reconnect faster than the other
595 // thread manages to take the lock itself. This lock allows the
596 // tests to block the reconnection. It's a shame that this leaked
597 // into non-test code, but the lock is only acquired here so will never
598 // be contended.
599 sessionReestablishLockForTests.lock();
600 try {
601 terminateConnection();
602 joinElectionInternal();
603 } finally {
604 sessionReestablishLockForTests.unlock();
605 }
606 }
607
608 @VisibleForTesting
609 void preventSessionReestablishmentForTests() {
610 sessionReestablishLockForTests.lock();
611 }
612
613 @VisibleForTesting
614 void allowSessionReestablishmentForTests() {
615 sessionReestablishLockForTests.unlock();
616 }
617
618 @VisibleForTesting
619 long getZKSessionIdForTests() {
620 return zkClient.getSessionId();
621 }
622
623 @VisibleForTesting
624 synchronized State getStateForTests() {
625 return state;
626 }
627
628 private boolean reEstablishSession() {
629 int connectionRetryCount = 0;
630 boolean success = false;
631 while(!success && connectionRetryCount < NUM_RETRIES) {
632 LOG.debug("Establishing zookeeper connection");
633 try {
634 createConnection();
635 success = true;
636 } catch(IOException e) {
637 LOG.warn(e);
638 try {
639 Thread.sleep(5000);
640 } catch(InterruptedException e1) {
641 LOG.warn(e1);
642 }
643 }
644 ++connectionRetryCount;
645 }
646 return success;
647 }
648
649 private void createConnection() throws IOException {
650 zkClient = getNewZooKeeper();
651 }
652
653 private void terminateConnection() {
654 if (zkClient == null) {
655 return;
656 }
657 LOG.debug("Terminating ZK connection");
658 ZooKeeper tempZk = zkClient;
659 zkClient = null;
660 try {
661 tempZk.close();
662 } catch(InterruptedException e) {
663 LOG.warn(e);
664 }
665 zkConnectionState = ConnectionState.TERMINATED;
666 }
667
668 private void reset() {
669 state = State.INIT;
670 terminateConnection();
671 }
672
673 private void becomeActive() {
674 assert wantToBeInElection;
675 if (state != State.ACTIVE) {
676 try {
677 Stat oldBreadcrumbStat = fenceOldActive();
678 writeBreadCrumbNode(oldBreadcrumbStat);
679 } catch (Exception e) {
680 LOG.warn("Exception handling the winning of election", e);
681 reJoinElection();
682 return;
683 }
684 LOG.debug("Becoming active");
685 state = State.ACTIVE;
686 appClient.becomeActive();
687 }
688 }
689
690 /**
691 * Write the "ActiveBreadCrumb" node, indicating that this node may need
692 * to be fenced on failover.
693 * @param oldBreadcrumbStat
694 */
695 private void writeBreadCrumbNode(Stat oldBreadcrumbStat)
696 throws KeeperException, InterruptedException {
697 LOG.info("Writing znode " + zkBreadCrumbPath +
698 " to indicate that the local node is the most recent active...");
699 if (oldBreadcrumbStat == null) {
700 // No previous active, just create the node
701 createWithRetries(zkBreadCrumbPath, appData, zkAcl,
702 CreateMode.PERSISTENT);
703 } else {
704 // There was a previous active, update the node
705 setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion());
706 }
707 }
708
709 /**
710 * Try to delete the "ActiveBreadCrumb" node when gracefully giving up
711 * active status.
712 * If this fails, it will simply warn, since the graceful release behavior
713 * is only an optimization.
714 */
715 private void tryDeleteOwnBreadCrumbNode() {
716 assert state == State.ACTIVE;
717 LOG.info("Deleting bread-crumb of active node...");
718
719 // Sanity check the data. This shouldn't be strictly necessary,
720 // but better to play it safe.
721 Stat stat = new Stat();
722 byte[] data = null;
723 try {
724 data = zkClient.getData(zkBreadCrumbPath, false, stat);
725
726 if (!Arrays.equals(data, appData)) {
727 throw new IllegalStateException(
728 "We thought we were active, but in fact " +
729 "the active znode had the wrong data: " +
730 StringUtils.byteToHexString(data) + " (stat=" + stat + ")");
731 }
732
733 deleteWithRetries(zkBreadCrumbPath, stat.getVersion());
734 } catch (Exception e) {
735 LOG.warn("Unable to delete our own bread-crumb of being active at " +
736 zkBreadCrumbPath + ": " + e.getLocalizedMessage() + ". " +
737 "Expecting to be fenced by the next active.");
738 }
739 }
740
741 /**
742 * If there is a breadcrumb node indicating that another node may need
743 * fencing, try to fence that node.
744 * @return the Stat of the breadcrumb node that was read, or null
745 * if no breadcrumb node existed
746 */
747 private Stat fenceOldActive() throws InterruptedException, KeeperException {
748 final Stat stat = new Stat();
749 byte[] data;
750 LOG.info("Checking for any old active which needs to be fenced...");
751 try {
752 data = zkDoWithRetries(new ZKAction<byte[]>() {
753 @Override
754 public byte[] run() throws KeeperException, InterruptedException {
755 return zkClient.getData(zkBreadCrumbPath, false, stat);
756 }
757 });
758 } catch (KeeperException ke) {
759 if (isNodeDoesNotExist(ke.code())) {
760 LOG.info("No old node to fence");
761 return null;
762 }
763
764 // If we failed to read for any other reason, then likely we lost
765 // our session, or we don't have permissions, etc. In any case,
766 // we probably shouldn't become active, and failing the whole
767 // thing is the best bet.
768 throw ke;
769 }
770
771 LOG.info("Old node exists: " + StringUtils.byteToHexString(data));
772 if (Arrays.equals(data, appData)) {
773 LOG.info("But old node has our own data, so don't need to fence it.");
774 } else {
775 appClient.fenceOldActive(data);
776 }
777 return stat;
778 }
779
780 private void becomeStandby() {
781 if (state != State.STANDBY) {
782 LOG.debug("Becoming standby");
783 state = State.STANDBY;
784 appClient.becomeStandby();
785 }
786 }
787
788 private void enterNeutralMode() {
789 if (state != State.NEUTRAL) {
790 LOG.debug("Entering neutral mode");
791 state = State.NEUTRAL;
792 appClient.enterNeutralMode();
793 }
794 }
795
796 private void createLockNodeAsync() {
797 zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL,
798 this, zkClient);
799 }
800
801 private void monitorLockNodeAsync() {
802 zkClient.exists(zkLockFilePath,
803 new WatcherWithClientRef(zkClient), this,
804 zkClient);
805 }
806
807 private String createWithRetries(final String path, final byte[] data,
808 final List<ACL> acl, final CreateMode mode)
809 throws InterruptedException, KeeperException {
810 return zkDoWithRetries(new ZKAction<String>() {
811 public String run() throws KeeperException, InterruptedException {
812 return zkClient.create(path, data, acl, mode);
813 }
814 });
815 }
816
817 private Stat setDataWithRetries(final String path, final byte[] data,
818 final int version) throws InterruptedException, KeeperException {
819 return zkDoWithRetries(new ZKAction<Stat>() {
820 public Stat run() throws KeeperException, InterruptedException {
821 return zkClient.setData(path, data, version);
822 }
823 });
824 }
825
826 private void deleteWithRetries(final String path, final int version)
827 throws KeeperException, InterruptedException {
828 zkDoWithRetries(new ZKAction<Void>() {
829 public Void run() throws KeeperException, InterruptedException {
830 zkClient.delete(path, version);
831 return null;
832 }
833 });
834 }
835
836 private static <T> T zkDoWithRetries(ZKAction<T> action)
837 throws KeeperException, InterruptedException {
838 int retry = 0;
839 while (true) {
840 try {
841 return action.run();
842 } catch (KeeperException ke) {
843 if (shouldRetry(ke.code()) && ++retry < NUM_RETRIES) {
844 continue;
845 }
846 throw ke;
847 }
848 }
849 }
850
851 private interface ZKAction<T> {
852 T run() throws KeeperException, InterruptedException;
853 }
854
855 /**
856 * The callbacks and watchers pass a reference to the ZK client
857 * which made the original call. We don't want to take action
858 * based on any callbacks from prior clients after we quit
859 * the election.
860 * @param ctx the ZK client passed into the watcher
861 * @return true if it matches the current client
862 */
863 private synchronized boolean isStaleClient(Object ctx) {
864 Preconditions.checkNotNull(ctx);
865 if (zkClient != (ZooKeeper)ctx) {
866 LOG.warn("Ignoring stale result from old client with sessionId " +
867 String.format("0x%08x", ((ZooKeeper)ctx).getSessionId()));
868 return true;
869 }
870 return false;
871 }
872
873 /**
874 * Watcher implementation which keeps a reference around to the
875 * original ZK connection, and passes it back along with any
876 * events.
877 */
878 private final class WatcherWithClientRef implements Watcher {
879 private final ZooKeeper zk;
880
881 private WatcherWithClientRef(ZooKeeper zk) {
882 this.zk = zk;
883 }
884
885 @Override
886 public void process(WatchedEvent event) {
887 ActiveStandbyElector.this.processWatchEvent(
888 zk, event);
889 }
890 }
891
892 private static boolean isSuccess(Code code) {
893 return (code == Code.OK);
894 }
895
896 private static boolean isNodeExists(Code code) {
897 return (code == Code.NODEEXISTS);
898 }
899
900 private static boolean isNodeDoesNotExist(Code code) {
901 return (code == Code.NONODE);
902 }
903
904 private static boolean isSessionExpired(Code code) {
905 return (code == Code.SESSIONEXPIRED);
906 }
907
908 private static boolean shouldRetry(Code code) {
909 switch (code) {
910 case CONNECTIONLOSS:
911 case OPERATIONTIMEOUT:
912 return true;
913 }
914 return false;
915 }
916
917 }