001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.hdfs.server.common.Util.now;
021
022import java.io.File;
023import java.io.FilterInputStream;
024import java.io.IOException;
025import java.io.InputStream;
026import java.util.Arrays;
027import java.util.EnumMap;
028
029import org.apache.hadoop.fs.permission.PermissionStatus;
030import org.apache.hadoop.hdfs.protocol.HdfsConstants;
031import org.apache.hadoop.hdfs.protocol.LayoutVersion;
032import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
033import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
034import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
035import org.apache.hadoop.hdfs.server.common.Storage;
036import org.apache.hadoop.hdfs.server.namenode.EditLogFileInputStream.LogHeaderCorruptException;
037import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCloseOp;
038import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.CancelDelegationTokenOp;
039import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ClearNSQuotaOp;
040import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ConcatDeleteOp;
041import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.DeleteOp;
042import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.GetDelegationTokenOp;
043import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.MkdirOp;
044import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ReassignLeaseOp;
045import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenameOldOp;
046import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenameOp;
047import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenewDelegationTokenOp;
048import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetGenstampOp;
049import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetNSQuotaOp;
050import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetOwnerOp;
051import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetPermissionsOp;
052import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetQuotaOp;
053import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetReplicationOp;
054import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SymlinkOp;
055import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.TimesOp;
056import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.UpdateMasterKeyOp;
057import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
058import org.apache.hadoop.hdfs.util.Holder;
059import org.apache.hadoop.io.IOUtils;
060
061import com.google.common.base.Joiner;
062
063public class FSEditLogLoader {
064  private final FSNamesystem fsNamesys;
065
  /**
   * @param fsNamesys the namespace that loaded edits will be applied to
   */
  public FSEditLogLoader(FSNamesystem fsNamesys) {
    this.fsNamesys = fsNamesys;
  }
069  
070  /**
071   * Load an edit log, and apply the changes to the in-memory structure
072   * This is where we apply edits that we've been writing to disk all
073   * along.
074   */
075  int loadFSEdits(EditLogInputStream edits, long expectedStartingTxId)
076  throws IOException {
077    long startTime = now();
078    int numEdits = loadFSEdits(edits, true, expectedStartingTxId);
079    FSImage.LOG.info("Edits file " + edits.getName() 
080        + " of size " + edits.length() + " edits # " + numEdits 
081        + " loaded in " + (now()-startTime)/1000 + " seconds.");
082    return numEdits;
083  }
084
085  int loadFSEdits(EditLogInputStream edits, boolean closeOnExit,
086                  long expectedStartingTxId)
087      throws IOException {
088    int numEdits = 0;
089    int logVersion = edits.getVersion();
090
091    try {
092      numEdits = loadEditRecords(logVersion, edits, false, 
093                                 expectedStartingTxId);
094    } finally {
095      if(closeOnExit) {
096        edits.close();
097      }
098    }
099    
100    return numEdits;
101  }
102
  /**
   * Reads ops from {@code in} one at a time and applies each to the
   * in-memory namespace, holding both the namesystem and directory write
   * locks for the whole replay. When the log format stores transaction ids
   * (Feature.STORED_TXIDS), each op's txid must be exactly one greater than
   * the previous; a gap or repeat aborts the load.
   *
   * @param logVersion layout version the stream was written with
   * @param in stream of serialized edit records
   * @param closeOnExit whether to close {@code in} when reading finishes
   * @param expectedStartingTxId txid expected on the first op read
   * @return the number of ops applied
   * @throws IOException wrapping any failure, with recent opcode offsets
   *         appended to aid manual recovery of a corrupt log
   */
  @SuppressWarnings("deprecation")
  int loadEditRecords(int logVersion, EditLogInputStream in, boolean closeOnExit,
                      long expectedStartingTxId)
      throws IOException {
    FSDirectory fsDir = fsNamesys.dir;
    int numEdits = 0;

    EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts =
      new EnumMap<FSEditLogOpCodes, Holder<Integer>>(FSEditLogOpCodes.class);

    // Lock order: namesystem first, then directory — released in reverse
    // in the outermost finally below.
    fsNamesys.writeLock();
    fsDir.writeLock();

    // Ring buffer of stream offsets for the most recent ops, reported in
    // the error message if replay fails; -1 marks unused slots.
    long recentOpcodeOffsets[] = new long[4];
    Arrays.fill(recentOpcodeOffsets, -1);

    try {
      // Initialized one below the expected first txid so the "+1" check in
      // the loop also validates the very first op.
      long txId = expectedStartingTxId - 1;

      try {
        FSEditLogOp op;
        while ((op = in.readOp()) != null) {
          recentOpcodeOffsets[numEdits % recentOpcodeOffsets.length] =
            in.getPosition();
          if (LayoutVersion.supports(Feature.STORED_TXIDS, logVersion)) {
            long thisTxId = op.txid;
            if (thisTxId != txId + 1) {
              throw new IOException("Expected transaction ID " +
                  (txId + 1) + " but got " + thisTxId);
            }
            txId = thisTxId;
          }

          numEdits++;
          incrOpCount(op.opCode, opCounts);
          switch (op.opCode) {
          case OP_ADD:
          case OP_CLOSE: {
            AddCloseOp addCloseOp = (AddCloseOp)op;

            // versions > 0 support per file replication
            // get name and replication
            final short replication  = fsNamesys.getBlockManager(
                ).adjustReplication(addCloseOp.replication);

            long blockSize = addCloseOp.blockSize;
            BlockInfo blocks[] = new BlockInfo[addCloseOp.blocks.length];
            for (int i = 0; i < addCloseOp.blocks.length; i++) {
              // On OP_ADD the final block is still being written, so it is
              // materialized as under-construction; OP_CLOSE finalizes all.
              if(addCloseOp.opCode == FSEditLogOpCodes.OP_ADD
                 && i == addCloseOp.blocks.length-1) {
                blocks[i] = new BlockInfoUnderConstruction(addCloseOp.blocks[i],
                                                           replication);
              } else {
                blocks[i] = new BlockInfo(addCloseOp.blocks[i], replication);
              }
            }

            // Fall back to the upgrade default when the op carries no
            // permissions (logs written before permissions were recorded).
            PermissionStatus permissions = fsNamesys.getUpgradePermission();
            if (addCloseOp.permissions != null) {
              permissions = addCloseOp.permissions;
            }


            // Older versions of HDFS does not store the block size in inode.
            // If the file has more than one block, use the size of the
            // first block as the blocksize. Otherwise use the default
            // block size.
            if (-8 <= logVersion && blockSize == 0) {
              if (blocks.length > 1) {
                blockSize = blocks[0].getNumBytes();
              } else {
                long first = ((blocks.length == 1)? blocks[0].getNumBytes(): 0);
                blockSize = Math.max(fsNamesys.getDefaultBlockSize(), first);
              }
            }


            // The open lease transaction re-creates a file if necessary.
            // Delete the file if it already exists.
            if (FSNamesystem.LOG.isDebugEnabled()) {
              FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path +
                  " numblocks : " + blocks.length +
                  " clientHolder " + addCloseOp.clientName +
                  " clientMachine " + addCloseOp.clientMachine);
            }

            // There are four cases here:
            // 1. OP_ADD to create a new file
            // 2. OP_ADD to update file blocks
            // 3. OP_ADD to open file for append
            // 4. OP_CLOSE to close the file

            // See if the file already exists
            INodeFile oldFile = fsDir.getFileINode(addCloseOp.path);
            if (oldFile == null) { // OP_ADD for a new file
              assert addCloseOp.opCode == FSEditLogOpCodes.OP_ADD : 
                "Expected opcode OP_ADD, but got " + addCloseOp.opCode;
              fsDir.unprotectedAddFile(
                  addCloseOp.path, permissions, blocks, replication,
                  addCloseOp.mtime, addCloseOp.atime, blockSize,
                  addCloseOp.clientName, addCloseOp.clientMachine);
              // The new file is under construction, so track its lease.
              fsNamesys.leaseManager.addLease(addCloseOp.clientName, 
                  addCloseOp.path);
            } else {
              fsDir.updateFile(oldFile, addCloseOp.path, blocks,
                  addCloseOp.mtime, addCloseOp.atime);
              if(addCloseOp.opCode == FSEditLogOpCodes.OP_CLOSE) {  // OP_CLOSE
                if (!oldFile.isUnderConstruction() &&
                    logVersion <= LayoutVersion.BUGFIX_HDFS_2991_VERSION) {
                  // There was a bug (HDFS-2991) in hadoop < 0.23.1 where OP_CLOSE
                  // could show up twice in a row. But after that version, this
                  // should be fixed, so we should treat it as an error.
                  throw new IOException(
                      "File is not under construction: " + addCloseOp.path);
                }
                fsNamesys.getBlockManager().completeBlock(
                    oldFile, blocks.length-1, true);
                
                if (oldFile.isUnderConstruction()) {
                  // Swap the under-construction inode for a finalized one
                  // and release the writer's lease.
                  INodeFile newFile =
                    ((INodeFileUnderConstruction)oldFile).convertToInodeFile();
                  fsDir.replaceNode(addCloseOp.path, oldFile, newFile);
                  fsNamesys.leaseManager.removeLease(
                      ((INodeFileUnderConstruction)oldFile).getClientName(),
                      addCloseOp.path);
                }
              } else if(! oldFile.isUnderConstruction()) {  // OP_ADD for append
                fsNamesys.prepareFileForWrite(addCloseOp.path, oldFile,
                    addCloseOp.clientName, addCloseOp.clientMachine, null,
                    false);
              }
            }
            break;
          }
          case OP_SET_REPLICATION: {
            SetReplicationOp setReplicationOp = (SetReplicationOp)op;
            short replication = fsNamesys.getBlockManager().adjustReplication(
                setReplicationOp.replication);
            fsDir.unprotectedSetReplication(setReplicationOp.path,
                                            replication, null);
            break;
          }
          case OP_CONCAT_DELETE: {
            ConcatDeleteOp concatDeleteOp = (ConcatDeleteOp)op;
            fsDir.unprotectedConcat(concatDeleteOp.trg, concatDeleteOp.srcs,
                concatDeleteOp.timestamp);
            break;
          }
          case OP_RENAME_OLD: {
            // Pre-FileContext rename (no Options.Rename flags).
            RenameOldOp renameOp = (RenameOldOp)op;
            fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
                                      renameOp.timestamp);
            break;
          }
          case OP_DELETE: {
            DeleteOp deleteOp = (DeleteOp)op;
            fsDir.unprotectedDelete(deleteOp.path, deleteOp.timestamp);
            break;
          }
          case OP_MKDIR: {
            MkdirOp mkdirOp = (MkdirOp)op;
            // Same permission fallback as OP_ADD above.
            PermissionStatus permissions = fsNamesys.getUpgradePermission();
            if (mkdirOp.permissions != null) {
              permissions = mkdirOp.permissions;
            }

            fsDir.unprotectedMkdir(mkdirOp.path, permissions,
                                   mkdirOp.timestamp);
            break;
          }
          case OP_SET_GENSTAMP: {
            SetGenstampOp setGenstampOp = (SetGenstampOp)op;
            fsNamesys.setGenerationStamp(setGenstampOp.genStamp);
            break;
          }
          case OP_SET_PERMISSIONS: {
            SetPermissionsOp setPermissionsOp = (SetPermissionsOp)op;
            fsDir.unprotectedSetPermission(setPermissionsOp.src,
                                           setPermissionsOp.permissions);
            break;
          }
          case OP_SET_OWNER: {
            SetOwnerOp setOwnerOp = (SetOwnerOp)op;
            fsDir.unprotectedSetOwner(setOwnerOp.src, setOwnerOp.username,
                                      setOwnerOp.groupname);
            break;
          }
          case OP_SET_NS_QUOTA: {
            // Sets only the namespace quota; diskspace quota untouched.
            SetNSQuotaOp setNSQuotaOp = (SetNSQuotaOp)op;
            fsDir.unprotectedSetQuota(setNSQuotaOp.src,
                                      setNSQuotaOp.nsQuota,
                                      HdfsConstants.QUOTA_DONT_SET);
            break;
          }
          case OP_CLEAR_NS_QUOTA: {
            ClearNSQuotaOp clearNSQuotaOp = (ClearNSQuotaOp)op;
            fsDir.unprotectedSetQuota(clearNSQuotaOp.src,
                                      HdfsConstants.QUOTA_RESET,
                                      HdfsConstants.QUOTA_DONT_SET);
            break;
          }

          case OP_SET_QUOTA:
            SetQuotaOp setQuotaOp = (SetQuotaOp)op;
            fsDir.unprotectedSetQuota(setQuotaOp.src,
                                      setQuotaOp.nsQuota,
                                      setQuotaOp.dsQuota);
            break;

          case OP_TIMES: {
            TimesOp timesOp = (TimesOp)op;

            fsDir.unprotectedSetTimes(timesOp.path,
                                      timesOp.mtime,
                                      timesOp.atime, true);
            break;
          }
          case OP_SYMLINK: {
            SymlinkOp symlinkOp = (SymlinkOp)op;
            fsDir.unprotectedSymlink(symlinkOp.path, symlinkOp.value,
                                     symlinkOp.mtime, symlinkOp.atime,
                                     symlinkOp.permissionStatus);
            break;
          }
          case OP_RENAME: {
            RenameOp renameOp = (RenameOp)op;

            fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
                                      renameOp.timestamp, renameOp.options);
            break;
          }
          case OP_GET_DELEGATION_TOKEN: {
            GetDelegationTokenOp getDelegationTokenOp
              = (GetDelegationTokenOp)op;

            fsNamesys.getDelegationTokenSecretManager()
              .addPersistedDelegationToken(getDelegationTokenOp.token,
                                           getDelegationTokenOp.expiryTime);
            break;
          }
          case OP_RENEW_DELEGATION_TOKEN: {
            RenewDelegationTokenOp renewDelegationTokenOp
              = (RenewDelegationTokenOp)op;
            fsNamesys.getDelegationTokenSecretManager()
              .updatePersistedTokenRenewal(renewDelegationTokenOp.token,
                                           renewDelegationTokenOp.expiryTime);
            break;
          }
          case OP_CANCEL_DELEGATION_TOKEN: {
            CancelDelegationTokenOp cancelDelegationTokenOp
              = (CancelDelegationTokenOp)op;
            fsNamesys.getDelegationTokenSecretManager()
                .updatePersistedTokenCancellation(
                    cancelDelegationTokenOp.token);
            break;
          }
          case OP_UPDATE_MASTER_KEY: {
            UpdateMasterKeyOp updateMasterKeyOp = (UpdateMasterKeyOp)op;
            fsNamesys.getDelegationTokenSecretManager()
              .updatePersistedMasterKey(updateMasterKeyOp.key);
            break;
          }
          case OP_REASSIGN_LEASE: {
            ReassignLeaseOp reassignLeaseOp = (ReassignLeaseOp)op;

            Lease lease = fsNamesys.leaseManager.getLease(
                reassignLeaseOp.leaseHolder);
            INodeFileUnderConstruction pendingFile =
                (INodeFileUnderConstruction) fsDir.getFileINode(
                    reassignLeaseOp.path);
            fsNamesys.reassignLeaseInternal(lease,
                reassignLeaseOp.path, reassignLeaseOp.newHolder, pendingFile);
            break;
          }
          case OP_START_LOG_SEGMENT:
          case OP_END_LOG_SEGMENT: {
            // no data in here currently.
            break;
          }
          case OP_DATANODE_ADD:
          case OP_DATANODE_REMOVE:
            // Obsolete ops: datanode membership is no longer kept in the
            // edit log; skip without applying anything.
            break;
          default:
            throw new IOException("Invalid operation read " + op.opCode);
          }
        }

      } catch (IOException ex) {
        // May rethrow ex wrapped with 0.20.203-upgrade guidance, or as-is.
        check203UpgradeFailure(logVersion, ex);
      } finally {
        if(closeOnExit)
          in.close();
      }
    } catch (Throwable t) {
      // Catch Throwable because in the case of a truly corrupt edits log, any
      // sort of error might be thrown (NumberFormat, NullPointer, EOF, etc.)
      StringBuilder sb = new StringBuilder();
      sb.append("Error replaying edit log at offset " + in.getPosition());
      if (recentOpcodeOffsets[0] != -1) {
        Arrays.sort(recentOpcodeOffsets);
        sb.append("\nRecent opcode offsets:");
        for (long offset : recentOpcodeOffsets) {
          if (offset != -1) {
            sb.append(' ').append(offset);
          }
        }
      }
      String errorMessage = sb.toString();
      FSImage.LOG.error(errorMessage);
      throw new IOException(errorMessage, t);
    } finally {
      fsDir.writeUnlock();
      fsNamesys.writeUnlock();
    }
    if (FSImage.LOG.isDebugEnabled()) {
      dumpOpCounts(opCounts);
    }
    return numEdits;
  }
422
423
424  private static void dumpOpCounts(
425      EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts) {
426    StringBuilder sb = new StringBuilder();
427    sb.append("Summary of operations loaded from edit log:\n  ");
428    Joiner.on("\n  ").withKeyValueSeparator("=").appendTo(sb, opCounts);
429    FSImage.LOG.debug(sb.toString());
430  }
431
432  private void incrOpCount(FSEditLogOpCodes opCode,
433      EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts) {
434    Holder<Integer> holder = opCounts.get(opCode);
435    if (holder == null) {
436      holder = new Holder<Integer>(1);
437      opCounts.put(opCode, holder);
438    } else {
439      holder.held++;
440    }
441  }
442
443  /**
444   * Throw appropriate exception during upgrade from 203, when editlog loading
445   * could fail due to opcode conflicts.
446   */
447  private void check203UpgradeFailure(int logVersion, IOException ex)
448      throws IOException {
449    // 0.20.203 version version has conflicting opcodes with the later releases.
450    // The editlog must be emptied by restarting the namenode, before proceeding
451    // with the upgrade.
452    if (Storage.is203LayoutVersion(logVersion)
453        && logVersion != HdfsConstants.LAYOUT_VERSION) {
454      String msg = "During upgrade failed to load the editlog version "
455          + logVersion + " from release 0.20.203. Please go back to the old "
456          + " release and restart the namenode. This empties the editlog "
457          + " and saves the namespace. Resume the upgrade after this step.";
458      throw new IOException(msg, ex);
459    } else {
460      throw ex;
461    }
462  }
463  
464  static EditLogValidation validateEditLog(File file) throws IOException {
465    EditLogFileInputStream in;
466    try {
467      in = new EditLogFileInputStream(file);
468    } catch (LogHeaderCorruptException corrupt) {
469      // If it's missing its header, this is equivalent to no transactions
470      FSImage.LOG.warn("Log at " + file + " has no valid header",
471          corrupt);
472      return new EditLogValidation(0, 0);
473    }
474    
475    try {
476      return validateEditLog(in);
477    } finally {
478      IOUtils.closeStream(in);
479    }
480  }
481
482  /**
483   * Return the number of valid transactions in the stream. If the stream is
484   * truncated during the header, returns a value indicating that there are
485   * 0 valid transactions. This reads through the stream but does not close
486   * it.
487   * @throws IOException if the stream cannot be read due to an IO error (eg
488   *                     if the log does not exist)
489   */
490  static EditLogValidation validateEditLog(EditLogInputStream in) {
491    long numValid = 0;
492    long lastPos = 0;
493    try {
494      while (true) {
495        lastPos = in.getPosition();
496        if (in.readOp() == null) {
497          break;
498        }
499        numValid++;
500      }
501    } catch (Throwable t) {
502      // Catch Throwable and not just IOE, since bad edits may generate
503      // NumberFormatExceptions, AssertionErrors, OutOfMemoryErrors, etc.
504      FSImage.LOG.debug("Caught exception after reading " + numValid +
505          " ops from " + in + " while determining its valid length.", t);
506    }
507    return new EditLogValidation(lastPos, numValid);
508  }
509  
  /**
   * Result of scanning an edit log: the byte length holding readable
   * transactions and how many transactions were read within it.
   */
  static class EditLogValidation {
    // Stream offset up to which transactions could be read successfully.
    long validLength;
    // Number of transactions read before EOF or corruption stopped the scan.
    long numTransactions;
    
    EditLogValidation(long validLength, long numTransactions) {
      this.validLength = validLength;
      this.numTransactions = numTransactions;
    }
  }
519
520  /**
521   * Stream wrapper that keeps track of the current stream position.
522   */
523  static class PositionTrackingInputStream extends FilterInputStream {
524    private long curPos = 0;
525    private long markPos = -1;
526
527    public PositionTrackingInputStream(InputStream is) {
528      super(is);
529    }
530
531    public int read() throws IOException {
532      int ret = super.read();
533      if (ret != -1) curPos++;
534      return ret;
535    }
536
537    public int read(byte[] data) throws IOException {
538      int ret = super.read(data);
539      if (ret > 0) curPos += ret;
540      return ret;
541    }
542
543    public int read(byte[] data, int offset, int length) throws IOException {
544      int ret = super.read(data, offset, length);
545      if (ret > 0) curPos += ret;
546      return ret;
547    }
548
549    public void mark(int limit) {
550      super.mark(limit);
551      markPos = curPos;
552    }
553
554    public void reset() throws IOException {
555      if (markPos == -1) {
556        throw new IOException("Not marked!");
557      }
558      super.reset();
559      curPos = markPos;
560      markPos = -1;
561    }
562
563    public long getPos() {
564      return curPos;
565    }
566  }
567
568}