001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.net;
019
020 import java.util.ArrayList;
021 import java.util.Collection;
022 import java.util.Random;
023 import java.util.concurrent.locks.ReadWriteLock;
024 import java.util.concurrent.locks.ReentrantReadWriteLock;
025
026 import org.apache.commons.logging.Log;
027 import org.apache.commons.logging.LogFactory;
028 import org.apache.hadoop.classification.InterfaceAudience;
029 import org.apache.hadoop.classification.InterfaceStability;
030
/** The class represents a cluster of computers with a tree-like hierarchical
 * network topology.
 * For example, a cluster may consist of many data centers filled
 * with racks of computers.
 * In a network topology, leaves represent data nodes (computers) and inner
 * nodes represent switches/routers that manage traffic in/out of data centers
 * or racks.
 *
 */
040 @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
041 @InterfaceStability.Unstable
042 public class NetworkTopology {
043 public final static String DEFAULT_RACK = "/default-rack";
044 public final static int DEFAULT_HOST_LEVEL = 2;
045 public static final Log LOG =
046 LogFactory.getLog(NetworkTopology.class);
047
048 public static class InvalidTopologyException extends RuntimeException {
049 private static final long serialVersionUID = 1L;
050 public InvalidTopologyException(String msg) {
051 super(msg);
052 }
053 }
054
055 /** InnerNode represents a switch/router of a data center or rack.
056 * Different from a leaf node, it has non-null children.
057 */
058 private class InnerNode extends NodeBase {
059 private ArrayList<Node> children=new ArrayList<Node>();
060 private int numOfLeaves;
061
062 /** Construct an InnerNode from a path-like string */
063 InnerNode(String path) {
064 super(path);
065 }
066
067 /** Construct an InnerNode from its name and its network location */
068 InnerNode(String name, String location) {
069 super(name, location);
070 }
071
072 /** Construct an InnerNode
073 * from its name, its network location, its parent, and its level */
074 InnerNode(String name, String location, InnerNode parent, int level) {
075 super(name, location, parent, level);
076 }
077
078 /** @return its children */
079 Collection<Node> getChildren() {return children;}
080
081 /** @return the number of children this node has */
082 int getNumOfChildren() {
083 return children.size();
084 }
085
086 /** Judge if this node represents a rack
087 * @return true if it has no child or its children are not InnerNodes
088 */
089 boolean isRack() {
090 if (children.isEmpty()) {
091 return true;
092 }
093
094 Node firstChild = children.get(0);
095 if (firstChild instanceof InnerNode) {
096 return false;
097 }
098
099 return true;
100 }
101
102 /** Judge if this node is an ancestor of node <i>n</i>
103 *
104 * @param n a node
105 * @return true if this node is an ancestor of <i>n</i>
106 */
107 boolean isAncestor(Node n) {
108 return getPath(this).equals(NodeBase.PATH_SEPARATOR_STR) ||
109 (n.getNetworkLocation()+NodeBase.PATH_SEPARATOR_STR).
110 startsWith(getPath(this)+NodeBase.PATH_SEPARATOR_STR);
111 }
112
113 /** Judge if this node is the parent of node <i>n</i>
114 *
115 * @param n a node
116 * @return true if this node is the parent of <i>n</i>
117 */
118 boolean isParent(Node n) {
119 return n.getNetworkLocation().equals(getPath(this));
120 }
121
122 /* Return a child name of this node who is an ancestor of node <i>n</i> */
123 private String getNextAncestorName(Node n) {
124 if (!isAncestor(n)) {
125 throw new IllegalArgumentException(
126 this + "is not an ancestor of " + n);
127 }
128 String name = n.getNetworkLocation().substring(getPath(this).length());
129 if (name.charAt(0) == PATH_SEPARATOR) {
130 name = name.substring(1);
131 }
132 int index=name.indexOf(PATH_SEPARATOR);
133 if (index !=-1)
134 name = name.substring(0, index);
135 return name;
136 }
137
138 /** Add node <i>n</i> to the subtree of this node
139 * @param n node to be added
140 * @return true if the node is added; false otherwise
141 */
142 boolean add(Node n) {
143 if (!isAncestor(n))
144 throw new IllegalArgumentException(n.getName()+", which is located at "
145 +n.getNetworkLocation()+", is not a decendent of "
146 +getPath(this));
147 if (isParent(n)) {
148 // this node is the parent of n; add n directly
149 n.setParent(this);
150 n.setLevel(this.level+1);
151 for(int i=0; i<children.size(); i++) {
152 if (children.get(i).getName().equals(n.getName())) {
153 children.set(i, n);
154 return false;
155 }
156 }
157 children.add(n);
158 numOfLeaves++;
159 return true;
160 } else {
161 // find the next ancestor node
162 String parentName = getNextAncestorName(n);
163 InnerNode parentNode = null;
164 for(int i=0; i<children.size(); i++) {
165 if (children.get(i).getName().equals(parentName)) {
166 parentNode = (InnerNode)children.get(i);
167 break;
168 }
169 }
170 if (parentNode == null) {
171 // create a new InnerNode
172 parentNode = new InnerNode(parentName, getPath(this),
173 this, this.getLevel()+1);
174 children.add(parentNode);
175 }
176 // add n to the subtree of the next ancestor node
177 if (parentNode.add(n)) {
178 numOfLeaves++;
179 return true;
180 } else {
181 return false;
182 }
183 }
184 }
185
186 /** Remove node <i>n</i> from the subtree of this node
187 * @param n node to be deleted
188 * @return true if the node is deleted; false otherwise
189 */
190 boolean remove(Node n) {
191 String parent = n.getNetworkLocation();
192 String currentPath = getPath(this);
193 if (!isAncestor(n))
194 throw new IllegalArgumentException(n.getName()
195 +", which is located at "
196 +parent+", is not a descendent of "+currentPath);
197 if (isParent(n)) {
198 // this node is the parent of n; remove n directly
199 for(int i=0; i<children.size(); i++) {
200 if (children.get(i).getName().equals(n.getName())) {
201 children.remove(i);
202 numOfLeaves--;
203 n.setParent(null);
204 return true;
205 }
206 }
207 return false;
208 } else {
209 // find the next ancestor node: the parent node
210 String parentName = getNextAncestorName(n);
211 InnerNode parentNode = null;
212 int i;
213 for(i=0; i<children.size(); i++) {
214 if (children.get(i).getName().equals(parentName)) {
215 parentNode = (InnerNode)children.get(i);
216 break;
217 }
218 }
219 if (parentNode==null) {
220 return false;
221 }
222 // remove n from the parent node
223 boolean isRemoved = parentNode.remove(n);
224 // if the parent node has no children, remove the parent node too
225 if (isRemoved) {
226 if (parentNode.getNumOfChildren() == 0) {
227 children.remove(i);
228 }
229 numOfLeaves--;
230 }
231 return isRemoved;
232 }
233 } // end of remove
234
235 /** Given a node's string representation, return a reference to the node
236 * @param loc string location of the form /rack/node
237 * @return null if the node is not found or the childnode is there but
238 * not an instance of {@link InnerNode}
239 */
240 private Node getLoc(String loc) {
241 if (loc == null || loc.length() == 0) return this;
242
243 String[] path = loc.split(PATH_SEPARATOR_STR, 2);
244 Node childnode = null;
245 for(int i=0; i<children.size(); i++) {
246 if (children.get(i).getName().equals(path[0])) {
247 childnode = children.get(i);
248 }
249 }
250 if (childnode == null) return null; // non-existing node
251 if (path.length == 1) return childnode;
252 if (childnode instanceof InnerNode) {
253 return ((InnerNode)childnode).getLoc(path[1]);
254 } else {
255 return null;
256 }
257 }
258
259 /** get <i>leafIndex</i> leaf of this subtree
260 * if it is not in the <i>excludedNode</i>
261 *
262 * @param leafIndex an indexed leaf of the node
263 * @param excludedNode an excluded node (can be null)
264 * @return
265 */
266 private Node getLeaf(int leafIndex, Node excludedNode) {
267 int count=0;
268 // check if the excluded node a leaf
269 boolean isLeaf =
270 excludedNode == null || !(excludedNode instanceof InnerNode);
271 // calculate the total number of excluded leaf nodes
272 int numOfExcludedLeaves =
273 isLeaf ? 1 : ((InnerNode)excludedNode).getNumOfLeaves();
274 if (isRack()) { // children are leaves
275 if (isLeaf) { // excluded node is a leaf node
276 int excludedIndex = children.indexOf(excludedNode);
277 if (excludedIndex != -1 && leafIndex >= 0) {
278 // excluded node is one of the children so adjust the leaf index
279 leafIndex = leafIndex>=excludedIndex ? leafIndex+1 : leafIndex;
280 }
281 }
282 // range check
283 if (leafIndex<0 || leafIndex>=this.getNumOfChildren()) {
284 return null;
285 }
286 return children.get(leafIndex);
287 } else {
288 for(int i=0; i<children.size(); i++) {
289 InnerNode child = (InnerNode)children.get(i);
290 if (excludedNode == null || excludedNode != child) {
291 // not the excludedNode
292 int numOfLeaves = child.getNumOfLeaves();
293 if (excludedNode != null && child.isAncestor(excludedNode)) {
294 numOfLeaves -= numOfExcludedLeaves;
295 }
296 if (count+numOfLeaves > leafIndex) {
297 // the leaf is in the child subtree
298 return child.getLeaf(leafIndex-count, excludedNode);
299 } else {
300 // go to the next child
301 count = count+numOfLeaves;
302 }
303 } else { // it is the excluededNode
304 // skip it and set the excludedNode to be null
305 excludedNode = null;
306 }
307 }
308 return null;
309 }
310 }
311
312 int getNumOfLeaves() {
313 return numOfLeaves;
314 }
315 } // end of InnerNode
316
317 /**
318 * the root cluster map
319 */
320 InnerNode clusterMap = new InnerNode(InnerNode.ROOT);
321 /** Depth of all leaf nodes */
322 private int depthOfAllLeaves = -1;
323 /** rack counter */
324 private int numOfRacks = 0;
325 /** the lock used to manage access */
326 private ReadWriteLock netlock;
327
328 public NetworkTopology() {
329 netlock = new ReentrantReadWriteLock();
330 }
331
332 /** Add a leaf node
333 * Update node counter & rack counter if necessary
334 * @param node node to be added; can be null
335 * @exception IllegalArgumentException if add a node to a leave
336 or node to be added is not a leaf
337 */
338 public void add(Node node) {
339 if (node==null) return;
340 String oldTopoStr = this.toString();
341 if( node instanceof InnerNode ) {
342 throw new IllegalArgumentException(
343 "Not allow to add an inner node: "+NodeBase.getPath(node));
344 }
345 netlock.writeLock().lock();
346 try {
347 Node rack = getNode(node.getNetworkLocation());
348 if (rack != null && !(rack instanceof InnerNode)) {
349 throw new IllegalArgumentException("Unexpected data node "
350 + node.toString()
351 + " at an illegal network location");
352 }
353 if (clusterMap.add(node)) {
354 LOG.info("Adding a new node: "+NodeBase.getPath(node));
355 if (rack == null) {
356 numOfRacks++;
357 }
358 if (!(node instanceof InnerNode)) {
359 if (depthOfAllLeaves == -1) {
360 depthOfAllLeaves = node.getLevel();
361 } else {
362 if (depthOfAllLeaves != node.getLevel()) {
363 LOG.error("Error: can't add leaf node at depth " +
364 node.getLevel() + " to topology:\n" + oldTopoStr);
365 throw new InvalidTopologyException("Invalid network topology. " +
366 "You cannot have a rack and a non-rack node at the same " +
367 "level of the network topology.");
368 }
369 }
370 }
371 }
372 if(LOG.isDebugEnabled()) {
373 LOG.debug("NetworkTopology became:\n" + this.toString());
374 }
375 } finally {
376 netlock.writeLock().unlock();
377 }
378 }
379
380 /** Remove a node
381 * Update node counter and rack counter if necessary
382 * @param node node to be removed; can be null
383 */
384 public void remove(Node node) {
385 if (node==null) return;
386 if( node instanceof InnerNode ) {
387 throw new IllegalArgumentException(
388 "Not allow to remove an inner node: "+NodeBase.getPath(node));
389 }
390 LOG.info("Removing a node: "+NodeBase.getPath(node));
391 netlock.writeLock().lock();
392 try {
393 if (clusterMap.remove(node)) {
394 InnerNode rack = (InnerNode)getNode(node.getNetworkLocation());
395 if (rack == null) {
396 numOfRacks--;
397 }
398 }
399 if(LOG.isDebugEnabled()) {
400 LOG.debug("NetworkTopology became:\n" + this.toString());
401 }
402 } finally {
403 netlock.writeLock().unlock();
404 }
405 }
406
407 /** Check if the tree contains node <i>node</i>
408 *
409 * @param node a node
410 * @return true if <i>node</i> is already in the tree; false otherwise
411 */
412 public boolean contains(Node node) {
413 if (node == null) return false;
414 netlock.readLock().lock();
415 try {
416 Node parent = node.getParent();
417 for (int level = node.getLevel(); parent != null && level > 0;
418 parent = parent.getParent(), level--) {
419 if (parent == clusterMap) {
420 return true;
421 }
422 }
423 } finally {
424 netlock.readLock().unlock();
425 }
426 return false;
427 }
428
429 /** Given a string representation of a node, return its reference
430 *
431 * @param loc
432 * a path-like string representation of a node
433 * @return a reference to the node; null if the node is not in the tree
434 */
435 public Node getNode(String loc) {
436 netlock.readLock().lock();
437 try {
438 loc = NodeBase.normalize(loc);
439 if (!NodeBase.ROOT.equals(loc))
440 loc = loc.substring(1);
441 return clusterMap.getLoc(loc);
442 } finally {
443 netlock.readLock().unlock();
444 }
445 }
446
447 /** @return the total number of racks */
448 public int getNumOfRacks() {
449 netlock.readLock().lock();
450 try {
451 return numOfRacks;
452 } finally {
453 netlock.readLock().unlock();
454 }
455 }
456
457 /** @return the total number of leaf nodes */
458 public int getNumOfLeaves() {
459 netlock.readLock().lock();
460 try {
461 return clusterMap.getNumOfLeaves();
462 } finally {
463 netlock.readLock().unlock();
464 }
465 }
466
467 /** Return the distance between two nodes
468 * It is assumed that the distance from one node to its parent is 1
469 * The distance between two nodes is calculated by summing up their distances
470 * to their closest common ancestor.
471 * @param node1 one node
472 * @param node2 another node
473 * @return the distance between node1 and node2 which is zero if they are the same
474 * or {@link Integer#MAX_VALUE} if node1 or node2 do not belong to the cluster
475 */
476 public int getDistance(Node node1, Node node2) {
477 if (node1 == node2) {
478 return 0;
479 }
480 Node n1=node1, n2=node2;
481 int dis = 0;
482 netlock.readLock().lock();
483 try {
484 int level1=node1.getLevel(), level2=node2.getLevel();
485 while(n1!=null && level1>level2) {
486 n1 = n1.getParent();
487 level1--;
488 dis++;
489 }
490 while(n2!=null && level2>level1) {
491 n2 = n2.getParent();
492 level2--;
493 dis++;
494 }
495 while(n1!=null && n2!=null && n1.getParent()!=n2.getParent()) {
496 n1=n1.getParent();
497 n2=n2.getParent();
498 dis+=2;
499 }
500 } finally {
501 netlock.readLock().unlock();
502 }
503 if (n1==null) {
504 LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node1));
505 return Integer.MAX_VALUE;
506 }
507 if (n2==null) {
508 LOG.warn("The cluster does not contain node: "+NodeBase.getPath(node2));
509 return Integer.MAX_VALUE;
510 }
511 return dis+2;
512 }
513
514 /** Check if two nodes are on the same rack
515 * @param node1 one node (can be null)
516 * @param node2 another node (can be null)
517 * @return true if node1 and node2 are on the same rack; false otherwise
518 * @exception IllegalArgumentException when either node1 or node2 is null, or
519 * node1 or node2 do not belong to the cluster
520 */
521 public boolean isOnSameRack( Node node1, Node node2) {
522 if (node1 == null || node2 == null) {
523 return false;
524 }
525
526 netlock.readLock().lock();
527 try {
528 return node1.getParent()==node2.getParent();
529 } finally {
530 netlock.readLock().unlock();
531 }
532 }
533
534 final private static Random r = new Random();
535 /** randomly choose one node from <i>scope</i>
536 * if scope starts with ~, choose one from the all nodes except for the
537 * ones in <i>scope</i>; otherwise, choose one from <i>scope</i>
538 * @param scope range of nodes from which a node will be chosen
539 * @return the chosen node
540 */
541 public Node chooseRandom(String scope) {
542 netlock.readLock().lock();
543 try {
544 if (scope.startsWith("~")) {
545 return chooseRandom(NodeBase.ROOT, scope.substring(1));
546 } else {
547 return chooseRandom(scope, null);
548 }
549 } finally {
550 netlock.readLock().unlock();
551 }
552 }
553
554 private Node chooseRandom(String scope, String excludedScope){
555 if (excludedScope != null) {
556 if (scope.startsWith(excludedScope)) {
557 return null;
558 }
559 if (!excludedScope.startsWith(scope)) {
560 excludedScope = null;
561 }
562 }
563 Node node = getNode(scope);
564 if (!(node instanceof InnerNode)) {
565 return node;
566 }
567 InnerNode innerNode = (InnerNode)node;
568 int numOfDatanodes = innerNode.getNumOfLeaves();
569 if (excludedScope == null) {
570 node = null;
571 } else {
572 node = getNode(excludedScope);
573 if (!(node instanceof InnerNode)) {
574 numOfDatanodes -= 1;
575 } else {
576 numOfDatanodes -= ((InnerNode)node).getNumOfLeaves();
577 }
578 }
579 int leaveIndex = r.nextInt(numOfDatanodes);
580 return innerNode.getLeaf(leaveIndex, node);
581 }
582
583 /** return the number of leaves in <i>scope</i> but not in <i>excludedNodes</i>
584 * if scope starts with ~, return the number of nodes that are not
585 * in <i>scope</i> and <i>excludedNodes</i>;
586 * @param scope a path string that may start with ~
587 * @param excludedNodes a list of nodes
588 * @return number of available nodes
589 */
590 public int countNumOfAvailableNodes(String scope,
591 Collection<Node> excludedNodes) {
592 boolean isExcluded=false;
593 if (scope.startsWith("~")) {
594 isExcluded=true;
595 scope=scope.substring(1);
596 }
597 scope = NodeBase.normalize(scope);
598 int count=0; // the number of nodes in both scope & excludedNodes
599 netlock.readLock().lock();
600 try {
601 for(Node node:excludedNodes) {
602 if ((NodeBase.getPath(node)+NodeBase.PATH_SEPARATOR_STR).
603 startsWith(scope+NodeBase.PATH_SEPARATOR_STR)) {
604 count++;
605 }
606 }
607 Node n=getNode(scope);
608 int scopeNodeCount=1;
609 if (n instanceof InnerNode) {
610 scopeNodeCount=((InnerNode)n).getNumOfLeaves();
611 }
612 if (isExcluded) {
613 return clusterMap.getNumOfLeaves()-
614 scopeNodeCount-excludedNodes.size()+count;
615 } else {
616 return scopeNodeCount-count;
617 }
618 } finally {
619 netlock.readLock().unlock();
620 }
621 }
622
623 /** convert a network tree to a string */
624 public String toString() {
625 // print the number of racks
626 StringBuilder tree = new StringBuilder();
627 tree.append("Number of racks: ");
628 tree.append(numOfRacks);
629 tree.append("\n");
630 // print the number of leaves
631 int numOfLeaves = getNumOfLeaves();
632 tree.append("Expected number of leaves:");
633 tree.append(numOfLeaves);
634 tree.append("\n");
635 // print nodes
636 for(int i=0; i<numOfLeaves; i++) {
637 tree.append(NodeBase.getPath(clusterMap.getLeaf(i, null)));
638 tree.append("\n");
639 }
640 return tree.toString();
641 }
642
643 /* swap two array items */
644 static private void swap(Node[] nodes, int i, int j) {
645 Node tempNode;
646 tempNode = nodes[j];
647 nodes[j] = nodes[i];
648 nodes[i] = tempNode;
649
650 }
651
652 /** Sort nodes array by their distances to <i>reader</i>
653 * It linearly scans the array, if a local node is found, swap it with
654 * the first element of the array.
655 * If a local rack node is found, swap it with the first element following
656 * the local node.
657 * If neither local node or local rack node is found, put a random replica
658 * location at position 0.
659 * It leaves the rest nodes untouched.
660 * @param reader the node that wishes to read a block from one of the nodes
661 * @param nodes the list of nodes containing data for the reader
662 */
663 public void pseudoSortByDistance( Node reader, Node[] nodes ) {
664 int tempIndex = 0;
665 int localRackNode = -1;
666 if (reader != null ) {
667 //scan the array to find the local node & local rack node
668 for(int i=0; i<nodes.length; i++) {
669 if(tempIndex == 0 && reader == nodes[i]) { //local node
670 //swap the local node and the node at position 0
671 if( i != 0 ) {
672 swap(nodes, tempIndex, i);
673 }
674 tempIndex=1;
675 if(localRackNode != -1 ) {
676 if(localRackNode == 0) {
677 localRackNode = i;
678 }
679 break;
680 }
681 } else if(localRackNode == -1 && isOnSameRack(reader, nodes[i])) {
682 //local rack
683 localRackNode = i;
684 if(tempIndex != 0 ) break;
685 }
686 }
687
688 // swap the local rack node and the node at position tempIndex
689 if(localRackNode != -1 && localRackNode != tempIndex ) {
690 swap(nodes, tempIndex, localRackNode);
691 tempIndex++;
692 }
693 }
694
695 // put a random node at position 0 if it is not a local/local-rack node
696 if(tempIndex == 0 && localRackNode == -1 && nodes.length != 0) {
697 swap(nodes, 0, r.nextInt(nodes.length));
698 }
699 }
700 }