001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.ha;
020    
021    import java.io.IOException;
022    import java.util.Arrays;
023    import java.util.List;
024    import java.util.concurrent.locks.Lock;
025    import java.util.concurrent.locks.ReentrantLock;
026    
027    import org.apache.commons.logging.Log;
028    import org.apache.commons.logging.LogFactory;
029    import org.apache.hadoop.HadoopIllegalArgumentException;
030    import org.apache.hadoop.classification.InterfaceAudience;
031    import org.apache.hadoop.classification.InterfaceStability;
032    import org.apache.hadoop.util.StringUtils;
033    import org.apache.zookeeper.data.ACL;
034    import org.apache.zookeeper.KeeperException;
035    import org.apache.zookeeper.Watcher;
036    import org.apache.zookeeper.WatchedEvent;
037    import org.apache.zookeeper.Watcher.Event;
038    import org.apache.zookeeper.ZKUtil;
039    import org.apache.zookeeper.ZooKeeper;
040    import org.apache.zookeeper.CreateMode;
041    import org.apache.zookeeper.AsyncCallback.*;
042    import org.apache.zookeeper.data.Stat;
043    import org.apache.zookeeper.KeeperException.Code;
044    
045    import com.google.common.annotations.VisibleForTesting;
046    import com.google.common.base.Preconditions;
047    
048    /**
049     * 
050     * This class implements a simple library to perform leader election on top of
051     * Apache Zookeeper. Using Zookeeper as a coordination service, leader election
052     * can be performed by atomically creating an ephemeral lock file (znode) on
053     * Zookeeper. The service instance that successfully creates the znode becomes
054     * active and the rest become standbys. <br/>
055     * This election mechanism is only efficient for a small number of election
056     * candidates (on the order of tens) because contention on a single znode by
057     * a large number of candidates can result in Zookeeper overload. <br/>
058     * The elector does not guarantee fencing (protection of shared resources) among
059     * service instances. After it has notified an instance about becoming a leader,
060     * then that instance must ensure that it meets the service consistency
061     * requirements. If it cannot do so, then it is recommended to quit the
062     * election. The application implements the {@link ActiveStandbyElectorCallback}
063     * to interact with the elector.
064     */
065    @InterfaceAudience.Private
066    @InterfaceStability.Evolving
067    public class ActiveStandbyElector implements StatCallback, StringCallback {
068    
/**
 * Contract through which the elector reports election results to the
 * application. <br/>
 * Callbacks are delivered only on state changes, so {@code becomeActive} is
 * never invoked twice in a row without an intervening
 * {@code enterNeutralMode}. <br/>
 * The callbacks run on Zookeeper client library threads. Implementations
 * should return quickly so that Zookeeper client processing and
 * notifications are not held up: typically the application records the new
 * state, returns, and performs the related work afterwards. A further
 * callback may arrive while that work is still in flight, and the
 * application must be able to cope with that.
 */
public interface ActiveStandbyElectorCallback {
  /**
   * Invoked when the application has become the active leader.
   */
  void becomeActive();

  /**
   * Invoked when the application has become a standby.
   */
  void becomeStandby();

  /**
   * Invoked when the elector has been disconnected from Zookeeper and
   * therefore no longer knows the state of the lock. The service may ignore
   * the notification or stop performing state-changing operations. Once
   * reconnected, the elector re-verifies leadership and calls back through
   * {@code becomeActive} or {@code becomeStandby}. <br/>
   * Disconnects can be caused by network problems or by loss of the
   * Zookeeper quorum, so this callback can be used to guard against
   * split-brain. Switching to standby in that situation may be prudent, but
   * such transitions can be expensive; neutral mode lets the application
   * avoid them for transient problems.
   */
  void enterNeutralMode();

  /**
   * Invoked on a fatal error (e.g. wrong ACL's, unexpected Zookeeper errors
   * or persistent Zookeeper unavailability).
   *
   * @param errorMessage description of the failure
   */
  void notifyFatalError(String errorMessage);

  /**
   * Invoked when a previous active failed instead of exiting gracefully, so
   * that the new active can take fencing actions against it before
   * proceeding with failover.
   *
   * @param oldActiveData the application data provided by the prior active
   */
  void fenceOldActive(byte[] oldActiveData);
}
125    
126      /**
127       * Name of the lock znode used by the library. Protected for access in test
128       * classes
129       */
130      @VisibleForTesting
131      protected static final String LOCK_FILENAME = "ActiveStandbyElectorLock";
132      @VisibleForTesting
133      protected static final String BREADCRUMB_FILENAME = "ActiveBreadCrumb";
134    
135      public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class);
136    
137      private static final int NUM_RETRIES = 3;
138    
139      private static enum ConnectionState {
140        DISCONNECTED, CONNECTED, TERMINATED
141      };
142    
143      static enum State {
144        INIT, ACTIVE, STANDBY, NEUTRAL
145      };
146    
147      private State state = State.INIT;
148      private int createRetryCount = 0;
149      private int statRetryCount = 0;
150      private ZooKeeper zkClient;
151      private ConnectionState zkConnectionState = ConnectionState.TERMINATED;
152    
153      private final ActiveStandbyElectorCallback appClient;
154      private final String zkHostPort;
155      private final int zkSessionTimeout;
156      private final List<ACL> zkAcl;
157      private byte[] appData;
158      private final String zkLockFilePath;
159      private final String zkBreadCrumbPath;
160      private final String znodeWorkingDir;
161    
162      private Lock sessionReestablishLockForTests = new ReentrantLock();
163      private boolean wantToBeInElection;
164      
165      /**
166       * Create a new ActiveStandbyElector object <br/>
167       * The elector is created by providing to it the Zookeeper configuration, the
168       * parent znode under which to create the znode and a reference to the
169       * callback interface. <br/>
170       * The parent znode name must be the same for all service instances and
171       * different across services. <br/>
172       * After the leader has been lost, a new leader will be elected after the
173       * session timeout expires. Hence, the app must set this parameter based on
174       * its needs for failure response time. The session timeout must be greater
175       * than the Zookeeper disconnect timeout and is recommended to be 3X that
176       * value to enable Zookeeper to retry transient disconnections. Setting a very
177       * short session timeout may result in frequent transitions between active and
178       * standby states during issues like network outages/GS pauses.
179       * 
180       * @param zookeeperHostPorts
181       *          ZooKeeper hostPort for all ZooKeeper servers
182       * @param zookeeperSessionTimeout
183       *          ZooKeeper session timeout
184       * @param parentZnodeName
185       *          znode under which to create the lock
186       * @param acl
187       *          ZooKeeper ACL's
188       * @param app
189       *          reference to callback interface object
190       * @throws IOException
191       * @throws HadoopIllegalArgumentException
192       */
193      public ActiveStandbyElector(String zookeeperHostPorts,
194          int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
195          ActiveStandbyElectorCallback app) throws IOException,
196          HadoopIllegalArgumentException {
197        if (app == null || acl == null || parentZnodeName == null
198            || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
199          throw new HadoopIllegalArgumentException("Invalid argument");
200        }
201        zkHostPort = zookeeperHostPorts;
202        zkSessionTimeout = zookeeperSessionTimeout;
203        zkAcl = acl;
204        appClient = app;
205        znodeWorkingDir = parentZnodeName;
206        zkLockFilePath = znodeWorkingDir + "/" + LOCK_FILENAME;
207        zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;    
208    
209        // createConnection for future API calls
210        createConnection();
211      }
212    
213      /**
214       * To participate in election, the app will call joinElection. The result will
215       * be notified by a callback on either the becomeActive or becomeStandby app
216       * interfaces. <br/>
217       * After this the elector will automatically monitor the leader status and
218       * perform re-election if necessary<br/>
219       * The app could potentially start off in standby mode and ignore the
220       * becomeStandby call.
221       * 
222       * @param data
223       *          to be set by the app. non-null data must be set.
224       * @throws HadoopIllegalArgumentException
225       *           if valid data is not supplied
226       */
227      public synchronized void joinElection(byte[] data)
228          throws HadoopIllegalArgumentException {
229        
230        LOG.debug("Attempting active election");
231    
232        if (data == null) {
233          throw new HadoopIllegalArgumentException("data cannot be null");
234        }
235    
236        appData = new byte[data.length];
237        System.arraycopy(data, 0, appData, 0, data.length);
238    
239        joinElectionInternal();
240      }
241      
242      /**
243       * @return true if the configured parent znode exists
244       */
245      public synchronized boolean parentZNodeExists()
246          throws IOException, InterruptedException {
247        Preconditions.checkState(zkClient != null);
248        try {
249          return zkClient.exists(znodeWorkingDir, false) != null;
250        } catch (KeeperException e) {
251          throw new IOException("Couldn't determine existence of znode '" +
252              znodeWorkingDir + "'", e);
253        }
254      }
255    
256      /**
257       * Utility function to ensure that the configured base znode exists.
258       * This recursively creates the znode as well as all of its parents.
259       */
260      public synchronized void ensureParentZNode()
261          throws IOException, InterruptedException {
262        String pathParts[] = znodeWorkingDir.split("/");
263        Preconditions.checkArgument(pathParts.length >= 1 &&
264            "".equals(pathParts[0]),
265            "Invalid path: %s", znodeWorkingDir);
266        
267        StringBuilder sb = new StringBuilder();
268        for (int i = 1; i < pathParts.length; i++) {
269          sb.append("/").append(pathParts[i]);
270          String prefixPath = sb.toString();
271          LOG.debug("Ensuring existence of " + prefixPath);
272          try {
273            createWithRetries(prefixPath, new byte[]{}, zkAcl, CreateMode.PERSISTENT);
274          } catch (KeeperException e) {
275            if (isNodeExists(e.code())) {
276              // This is OK - just ensuring existence.
277              continue;
278            } else {
279              throw new IOException("Couldn't create " + prefixPath, e);
280            }
281          }
282        }
283        
284        LOG.info("Successfully created " + znodeWorkingDir + " in ZK.");
285      }
286      
287      /**
288       * Clear all of the state held within the parent ZNode.
289       * This recursively deletes everything within the znode as well as the
290       * parent znode itself. It should only be used when it's certain that
291       * no electors are currently participating in the election.
292       */
293      public synchronized void clearParentZNode()
294          throws IOException, InterruptedException {
295        try {
296          LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
297    
298          zkDoWithRetries(new ZKAction<Void>() {
299            @Override
300            public Void run() throws KeeperException, InterruptedException {
301              ZKUtil.deleteRecursive(zkClient, znodeWorkingDir);
302              return null;
303            }
304          });
305        } catch (KeeperException e) {
306          throw new IOException("Couldn't clear parent znode " + znodeWorkingDir,
307              e);
308        }
309        LOG.info("Successfully deleted " + znodeWorkingDir + " from ZK.");
310      }
311    
312    
313      /**
314       * Any service instance can drop out of the election by calling quitElection. 
315       * <br/>
316       * This will lose any leader status, if held, and stop monitoring of the lock
317       * node. <br/>
318       * If the instance wants to participate in election again, then it needs to
319       * call joinElection(). <br/>
320       * This allows service instances to take themselves out of rotation for known
321       * impending unavailable states (e.g. long GC pause or software upgrade).
322       * 
323       * @param needFence true if the underlying daemon may need to be fenced
324       * if a failover occurs due to dropping out of the election.
325       */
326      public synchronized void quitElection(boolean needFence) {
327        LOG.info("Yielding from election");
328        if (!needFence && state == State.ACTIVE) {
329          // If active is gracefully going back to standby mode, remove
330          // our permanent znode so no one fences us.
331          tryDeleteOwnBreadCrumbNode();
332        }
333        reset();
334        wantToBeInElection = false;
335      }
336    
337      /**
338       * Exception thrown when there is no active leader
339       */
340      public static class ActiveNotFoundException extends Exception {
341        private static final long serialVersionUID = 3505396722342846462L;
342      }
343    
344      /**
345       * get data set by the active leader
346       * 
347       * @return data set by the active instance
348       * @throws ActiveNotFoundException
349       *           when there is no active leader
350       * @throws KeeperException
351       *           other zookeeper operation errors
352       * @throws InterruptedException
353       * @throws IOException
354       *           when ZooKeeper connection could not be established
355       */
356      public synchronized byte[] getActiveData() throws ActiveNotFoundException,
357          KeeperException, InterruptedException, IOException {
358        try {
359          if (zkClient == null) {
360            createConnection();
361          }
362          Stat stat = new Stat();
363          return zkClient.getData(zkLockFilePath, false, stat);
364        } catch(KeeperException e) {
365          Code code = e.code();
366          if (isNodeDoesNotExist(code)) {
367            // handle the commonly expected cases that make sense for us
368            throw new ActiveNotFoundException();
369          } else {
370            throw e;
371          }
372        }
373      }
374    
375      /**
376       * interface implementation of Zookeeper callback for create
377       */
378      @Override
379      public synchronized void processResult(int rc, String path, Object ctx,
380          String name) {
381        if (isStaleClient(ctx)) return;
382        LOG.debug("CreateNode result: " + rc + " for path: " + path
383            + " connectionState: " + zkConnectionState);
384    
385        Code code = Code.get(rc);
386        if (isSuccess(code)) {
387          // we successfully created the znode. we are the leader. start monitoring
388          becomeActive();
389          monitorActiveStatus();
390          return;
391        }
392    
393        if (isNodeExists(code)) {
394          if (createRetryCount == 0) {
395            // znode exists and we did not retry the operation. so a different
396            // instance has created it. become standby and monitor lock.
397            becomeStandby();
398          }
399          // if we had retried then the znode could have been created by our first
400          // attempt to the server (that we lost) and this node exists response is
401          // for the second attempt. verify this case via ephemeral node owner. this
402          // will happen on the callback for monitoring the lock.
403          monitorActiveStatus();
404          return;
405        }
406    
407        String errorMessage = "Received create error from Zookeeper. code:"
408            + code.toString() + " for path " + path;
409        LOG.debug(errorMessage);
410    
411        if (shouldRetry(code)) {
412          if (createRetryCount < NUM_RETRIES) {
413            LOG.debug("Retrying createNode createRetryCount: " + createRetryCount);
414            ++createRetryCount;
415            createLockNodeAsync();
416            return;
417          }
418          errorMessage = errorMessage
419              + ". Not retrying further znode create connection errors.";
420        } else if (isSessionExpired(code)) {
421          // This isn't fatal - the client Watcher will re-join the election
422          LOG.warn("Lock acquisition failed because session was lost");
423          return;
424        }
425    
426        fatalError(errorMessage);
427      }
428    
429      /**
430       * interface implementation of Zookeeper callback for monitor (exists)
431       */
432      @Override
433      public synchronized void processResult(int rc, String path, Object ctx,
434          Stat stat) {
435        if (isStaleClient(ctx)) return;
436        LOG.debug("StatNode result: " + rc + " for path: " + path
437            + " connectionState: " + zkConnectionState);
438    
439        Code code = Code.get(rc);
440        if (isSuccess(code)) {
441          // the following owner check completes verification in case the lock znode
442          // creation was retried
443          if (stat.getEphemeralOwner() == zkClient.getSessionId()) {
444            // we own the lock znode. so we are the leader
445            becomeActive();
446          } else {
447            // we dont own the lock znode. so we are a standby.
448            becomeStandby();
449          }
450          // the watch set by us will notify about changes
451          return;
452        }
453    
454        if (isNodeDoesNotExist(code)) {
455          // the lock znode disappeared before we started monitoring it
456          enterNeutralMode();
457          joinElectionInternal();
458          return;
459        }
460    
461        String errorMessage = "Received stat error from Zookeeper. code:"
462            + code.toString();
463        LOG.debug(errorMessage);
464    
465        if (shouldRetry(code)) {
466          if (statRetryCount < NUM_RETRIES) {
467            ++statRetryCount;
468            monitorLockNodeAsync();
469            return;
470          }
471          errorMessage = errorMessage
472              + ". Not retrying further znode monitoring connection errors.";
473        }
474    
475        fatalError(errorMessage);
476      }
477    
478      /**
479       * interface implementation of Zookeeper watch events (connection and node)
480       */
481      synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
482        Event.EventType eventType = event.getType();
483        if (isStaleClient(zk)) return;
484        LOG.debug("Watcher event type: " + eventType + " with state:"
485            + event.getState() + " for path:" + event.getPath()
486            + " connectionState: " + zkConnectionState);
487    
488        if (eventType == Event.EventType.None) {
489          // the connection state has changed
490          switch (event.getState()) {
491          case SyncConnected:
492            LOG.info("Session connected.");
493            // if the listener was asked to move to safe state then it needs to
494            // be undone
495            ConnectionState prevConnectionState = zkConnectionState;
496            zkConnectionState = ConnectionState.CONNECTED;
497            if (prevConnectionState == ConnectionState.DISCONNECTED) {
498              monitorActiveStatus();
499            }
500            break;
501          case Disconnected:
502            LOG.info("Session disconnected. Entering neutral mode...");
503    
504            // ask the app to move to safe state because zookeeper connection
505            // is not active and we dont know our state
506            zkConnectionState = ConnectionState.DISCONNECTED;
507            enterNeutralMode();
508            break;
509          case Expired:
510            // the connection got terminated because of session timeout
511            // call listener to reconnect
512            LOG.info("Session expired. Entering neutral mode and rejoining...");
513            enterNeutralMode();
514            reJoinElection();
515            break;
516          default:
517            fatalError("Unexpected Zookeeper watch event state: "
518                + event.getState());
519            break;
520          }
521    
522          return;
523        }
524    
525        // a watch on lock path in zookeeper has fired. so something has changed on
526        // the lock. ideally we should check that the path is the same as the lock
527        // path but trusting zookeeper for now
528        String path = event.getPath();
529        if (path != null) {
530          switch (eventType) {
531          case NodeDeleted:
532            if (state == State.ACTIVE) {
533              enterNeutralMode();
534            }
535            joinElectionInternal();
536            break;
537          case NodeDataChanged:
538            monitorActiveStatus();
539            break;
540          default:
541            LOG.debug("Unexpected node event: " + eventType + " for path: " + path);
542            monitorActiveStatus();
543          }
544    
545          return;
546        }
547    
548        // some unexpected error has occurred
549        fatalError("Unexpected watch error from Zookeeper");
550      }
551    
552      /**
553       * Get a new zookeeper client instance. protected so that test class can
554       * inherit and pass in a mock object for zookeeper
555       * 
556       * @return new zookeeper client instance
557       * @throws IOException
558       */
559      protected synchronized ZooKeeper getNewZooKeeper() throws IOException {
560        ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, null);
561        zk.register(new WatcherWithClientRef(zk));
562        return zk;
563      }
564    
565      private void fatalError(String errorMessage) {
566        reset();
567        appClient.notifyFatalError(errorMessage);
568      }
569    
570      private void monitorActiveStatus() {
571        LOG.debug("Monitoring active leader");
572        statRetryCount = 0;
573        monitorLockNodeAsync();
574      }
575    
576      private void joinElectionInternal() {
577        if (zkClient == null) {
578          if (!reEstablishSession()) {
579            fatalError("Failed to reEstablish connection with ZooKeeper");
580            return;
581          }
582        }
583    
584        createRetryCount = 0;
585        wantToBeInElection = true;
586        createLockNodeAsync();
587      }
588    
589      private void reJoinElection() {
590        LOG.info("Trying to re-establish ZK session");
591        
592        // Some of the test cases rely on expiring the ZK sessions and
593        // ensuring that the other node takes over. But, there's a race
594        // where the original lease holder could reconnect faster than the other
595        // thread manages to take the lock itself. This lock allows the
596        // tests to block the reconnection. It's a shame that this leaked
597        // into non-test code, but the lock is only acquired here so will never
598        // be contended.
599        sessionReestablishLockForTests.lock();
600        try {
601          terminateConnection();
602          joinElectionInternal();
603        } finally {
604          sessionReestablishLockForTests.unlock();
605        }
606      }
607      
608      @VisibleForTesting
609      void preventSessionReestablishmentForTests() {
610        sessionReestablishLockForTests.lock();
611      }
612      
613      @VisibleForTesting
614      void allowSessionReestablishmentForTests() {
615        sessionReestablishLockForTests.unlock();
616      }
617      
618      @VisibleForTesting
619      long getZKSessionIdForTests() {
620        return zkClient.getSessionId();
621      }
622      
623      @VisibleForTesting
624      synchronized State getStateForTests() {
625        return state;
626      }
627    
628      private boolean reEstablishSession() {
629        int connectionRetryCount = 0;
630        boolean success = false;
631        while(!success && connectionRetryCount < NUM_RETRIES) {
632          LOG.debug("Establishing zookeeper connection");
633          try {
634            createConnection();
635            success = true;
636          } catch(IOException e) {
637            LOG.warn(e);
638            try {
639              Thread.sleep(5000);
640            } catch(InterruptedException e1) {
641              LOG.warn(e1);
642            }
643          }
644          ++connectionRetryCount;
645        }
646        return success;
647      }
648    
649      private void createConnection() throws IOException {
650        zkClient = getNewZooKeeper();
651      }
652      
653      private void terminateConnection() {
654        if (zkClient == null) {
655          return;
656        }
657        LOG.debug("Terminating ZK connection");
658        ZooKeeper tempZk = zkClient;
659        zkClient = null;
660        try {
661          tempZk.close();
662        } catch(InterruptedException e) {
663          LOG.warn(e);
664        }
665        zkConnectionState = ConnectionState.TERMINATED;
666      }
667    
668      private void reset() {
669        state = State.INIT;
670        terminateConnection();
671      }
672    
673      private void becomeActive() {
674        assert wantToBeInElection;
675        if (state != State.ACTIVE) {
676          try {
677            Stat oldBreadcrumbStat = fenceOldActive();
678            writeBreadCrumbNode(oldBreadcrumbStat);
679          } catch (Exception e) {
680            LOG.warn("Exception handling the winning of election", e);
681            reJoinElection();
682            return;
683          }
684          LOG.debug("Becoming active");
685          state = State.ACTIVE;
686          appClient.becomeActive();
687        }
688      }
689    
690      /**
691       * Write the "ActiveBreadCrumb" node, indicating that this node may need
692       * to be fenced on failover.
693       * @param oldBreadcrumbStat 
694       */
695      private void writeBreadCrumbNode(Stat oldBreadcrumbStat)
696          throws KeeperException, InterruptedException {
697        LOG.info("Writing znode " + zkBreadCrumbPath +
698            " to indicate that the local node is the most recent active...");
699        if (oldBreadcrumbStat == null) {
700          // No previous active, just create the node
701          createWithRetries(zkBreadCrumbPath, appData, zkAcl,
702            CreateMode.PERSISTENT);
703        } else {
704          // There was a previous active, update the node
705          setDataWithRetries(zkBreadCrumbPath, appData, oldBreadcrumbStat.getVersion());
706        }
707      }
708      
709      /**
710       * Try to delete the "ActiveBreadCrumb" node when gracefully giving up
711       * active status.
712       * If this fails, it will simply warn, since the graceful release behavior
713       * is only an optimization.
714       */
715      private void tryDeleteOwnBreadCrumbNode() {
716        assert state == State.ACTIVE;
717        LOG.info("Deleting bread-crumb of active node...");
718        
719        // Sanity check the data. This shouldn't be strictly necessary,
720        // but better to play it safe.
721        Stat stat = new Stat();
722        byte[] data = null;
723        try {
724          data = zkClient.getData(zkBreadCrumbPath, false, stat);
725    
726          if (!Arrays.equals(data, appData)) {
727            throw new IllegalStateException(
728                "We thought we were active, but in fact " +
729                "the active znode had the wrong data: " +
730                StringUtils.byteToHexString(data) + " (stat=" + stat + ")");
731          }
732          
733          deleteWithRetries(zkBreadCrumbPath, stat.getVersion());
734        } catch (Exception e) {
735          LOG.warn("Unable to delete our own bread-crumb of being active at " +
736              zkBreadCrumbPath + ": " + e.getLocalizedMessage() + ". " +
737              "Expecting to be fenced by the next active.");
738        }
739      }
740    
741      /**
742       * If there is a breadcrumb node indicating that another node may need
743       * fencing, try to fence that node.
744       * @return the Stat of the breadcrumb node that was read, or null
745       * if no breadcrumb node existed
746       */
747      private Stat fenceOldActive() throws InterruptedException, KeeperException {
748        final Stat stat = new Stat();
749        byte[] data;
750        LOG.info("Checking for any old active which needs to be fenced...");
751        try {
752          data = zkDoWithRetries(new ZKAction<byte[]>() {
753            @Override
754            public byte[] run() throws KeeperException, InterruptedException {
755              return zkClient.getData(zkBreadCrumbPath, false, stat);
756            }
757          });
758        } catch (KeeperException ke) {
759          if (isNodeDoesNotExist(ke.code())) {
760            LOG.info("No old node to fence");
761            return null;
762          }
763          
764          // If we failed to read for any other reason, then likely we lost
765          // our session, or we don't have permissions, etc. In any case,
766          // we probably shouldn't become active, and failing the whole
767          // thing is the best bet.
768          throw ke;
769        }
770    
771        LOG.info("Old node exists: " + StringUtils.byteToHexString(data));
772        if (Arrays.equals(data, appData)) {
773          LOG.info("But old node has our own data, so don't need to fence it.");
774        } else {
775          appClient.fenceOldActive(data);
776        }
777        return stat;
778      }
779    
780      private void becomeStandby() {
781        if (state != State.STANDBY) {
782          LOG.debug("Becoming standby");
783          state = State.STANDBY;
784          appClient.becomeStandby();
785        }
786      }
787    
788      private void enterNeutralMode() {
789        if (state != State.NEUTRAL) {
790          LOG.debug("Entering neutral mode");
791          state = State.NEUTRAL;
792          appClient.enterNeutralMode();
793        }
794      }
795    
796      private void createLockNodeAsync() {
797        zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL,
798            this, zkClient);
799      }
800    
801      private void monitorLockNodeAsync() {
802        zkClient.exists(zkLockFilePath, 
803            new WatcherWithClientRef(zkClient), this,
804            zkClient);
805      }
806    
807      private String createWithRetries(final String path, final byte[] data,
808          final List<ACL> acl, final CreateMode mode)
809          throws InterruptedException, KeeperException {
810        return zkDoWithRetries(new ZKAction<String>() {
811          public String run() throws KeeperException, InterruptedException {
812            return zkClient.create(path, data, acl, mode);
813          }
814        });
815      }
816    
817      private Stat setDataWithRetries(final String path, final byte[] data,
818          final int version) throws InterruptedException, KeeperException {
819        return zkDoWithRetries(new ZKAction<Stat>() {
820          public Stat run() throws KeeperException, InterruptedException {
821            return zkClient.setData(path, data, version);
822          }
823        });
824      }
825      
826      private void deleteWithRetries(final String path, final int version)
827          throws KeeperException, InterruptedException {
828        zkDoWithRetries(new ZKAction<Void>() {
829          public Void run() throws KeeperException, InterruptedException {
830            zkClient.delete(path, version);
831            return null;
832          }
833        });
834      }
835    
836      private static <T> T zkDoWithRetries(ZKAction<T> action)
837          throws KeeperException, InterruptedException {
838        int retry = 0;
839        while (true) {
840          try {
841            return action.run();
842          } catch (KeeperException ke) {
843            if (shouldRetry(ke.code()) && ++retry < NUM_RETRIES) {
844              continue;
845            }
846            throw ke;
847          }
848        }
849      }
850    
851      private interface ZKAction<T> {
852        T run() throws KeeperException, InterruptedException; 
853      }
854      
855      /**
856       * The callbacks and watchers pass a reference to the ZK client
857       * which made the original call. We don't want to take action
858       * based on any callbacks from prior clients after we quit
859       * the election.
860       * @param ctx the ZK client passed into the watcher
861       * @return true if it matches the current client
862       */
863      private synchronized boolean isStaleClient(Object ctx) {
864        Preconditions.checkNotNull(ctx);
865        if (zkClient != (ZooKeeper)ctx) {
866          LOG.warn("Ignoring stale result from old client with sessionId " +
867              String.format("0x%08x", ((ZooKeeper)ctx).getSessionId()));
868          return true;
869        }
870        return false;
871      }
872    
873      /**
874       * Watcher implementation which keeps a reference around to the
875       * original ZK connection, and passes it back along with any
876       * events.
877       */
878      private final class WatcherWithClientRef implements Watcher {
879        private final ZooKeeper zk;
880    
881        private WatcherWithClientRef(ZooKeeper zk) {
882          this.zk = zk;
883        }
884    
885        @Override
886        public void process(WatchedEvent event) {
887          ActiveStandbyElector.this.processWatchEvent(
888              zk, event);
889        }
890      }
891    
892      private static boolean isSuccess(Code code) {
893        return (code == Code.OK);
894      }
895    
896      private static boolean isNodeExists(Code code) {
897        return (code == Code.NODEEXISTS);
898      }
899    
900      private static boolean isNodeDoesNotExist(Code code) {
901        return (code == Code.NONODE);
902      }
903      
904      private static boolean isSessionExpired(Code code) {
905        return (code == Code.SESSIONEXPIRED);
906      }
907    
908      private static boolean shouldRetry(Code code) {
909        switch (code) {
910        case CONNECTIONLOSS:
911        case OPERATIONTIMEOUT:
912          return true;
913        }
914        return false;
915      }
916    
917    }