001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.crypto.key.KeyProviderCryptoExtension.EncryptedKeyVersion;
021import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
022import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
023import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT;
024import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY;
025import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT;
026import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY;
027import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_DEFAULT;
028import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY;
029import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_DEFAULT;
030import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY;
031import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT;
032import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_KEY;
033import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT;
034import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY;
035import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT;
036import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY;
037import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT;
038import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY;
039import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY;
040import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT;
041import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY;
042import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT;
043import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY;
044import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT;
045import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY;
046import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME;
047import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT;
048import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY;
049import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT;
050import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY;
051import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT;
052import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY;
053import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT;
054import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY;
055import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY;
056import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY;
057import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS;
058import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT;
059import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD;
060import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT;
061import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT;
062import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
063import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC;
064import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT;
065import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY;
066import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
067import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY;
068import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
069import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT;
070import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY;
071import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY;
072import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT;
073import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
074import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT;
075import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY;
076import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT;
077import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY;
078import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY;
079import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT;
080import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY;
081import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT;
082import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY;
083import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
084import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT;
085import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY;
086import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
087import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;
088import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
089import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY;
090import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT;
091import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_KEY;
092import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.SECURITY_XATTR_UNREADABLE_BY_SUPERUSER;
093import static org.apache.hadoop.util.Time.now;
094import static org.apache.hadoop.util.Time.monotonicNow;
095
096import java.io.BufferedWriter;
097import java.io.ByteArrayInputStream;
098import java.io.DataInput;
099import java.io.DataInputStream;
100import java.io.DataOutputStream;
101import java.io.File;
102import java.io.FileNotFoundException;
103import java.io.FileOutputStream;
104import java.io.IOException;
105import java.io.OutputStreamWriter;
106import java.io.PrintWriter;
107import java.io.StringWriter;
108import java.lang.management.ManagementFactory;
109import java.net.InetAddress;
110import java.net.URI;
111import java.security.GeneralSecurityException;
112import java.util.ArrayList;
113import java.util.Arrays;
114import java.util.Collection;
115import java.util.Collections;
116import java.util.Date;
117import java.util.EnumSet;
118import java.util.HashMap;
119import java.util.HashSet;
120import java.util.Iterator;
121import java.util.LinkedHashSet;
122import java.util.List;
123import java.util.Map;
124import java.util.Set;
125import java.util.TreeMap;
126import java.util.concurrent.TimeUnit;
127import java.util.concurrent.locks.Condition;
128import java.util.concurrent.locks.ReentrantLock;
129import java.util.concurrent.locks.ReentrantReadWriteLock;
130
131import javax.management.NotCompliantMBeanException;
132import javax.management.ObjectName;
133import javax.management.StandardMBean;
134
135import org.apache.commons.logging.Log;
136import org.apache.commons.logging.LogFactory;
137import org.apache.commons.logging.impl.Log4JLogger;
138import org.apache.hadoop.HadoopIllegalArgumentException;
139import org.apache.hadoop.classification.InterfaceAudience;
140import org.apache.hadoop.conf.Configuration;
141import org.apache.hadoop.crypto.CipherSuite;
142import org.apache.hadoop.crypto.CryptoProtocolVersion;
143import org.apache.hadoop.crypto.key.KeyProvider;
144import org.apache.hadoop.crypto.CryptoCodec;
145import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension;
146import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedListEntries;
147import org.apache.hadoop.fs.CacheFlag;
148import org.apache.hadoop.fs.ContentSummary;
149import org.apache.hadoop.fs.CreateFlag;
150import org.apache.hadoop.fs.FileAlreadyExistsException;
151import org.apache.hadoop.fs.FileEncryptionInfo;
152import org.apache.hadoop.fs.FileStatus;
153import org.apache.hadoop.fs.FileSystem;
154import org.apache.hadoop.fs.FsServerDefaults;
155import org.apache.hadoop.fs.InvalidPathException;
156import org.apache.hadoop.fs.Options;
157import org.apache.hadoop.fs.ParentNotDirectoryException;
158import org.apache.hadoop.fs.Path;
159import org.apache.hadoop.fs.UnresolvedLinkException;
160import org.apache.hadoop.fs.XAttr;
161import org.apache.hadoop.fs.XAttrSetFlag;
162import org.apache.hadoop.fs.permission.AclEntry;
163import org.apache.hadoop.fs.permission.AclStatus;
164import org.apache.hadoop.fs.permission.FsAction;
165import org.apache.hadoop.fs.permission.FsPermission;
166import org.apache.hadoop.fs.permission.PermissionStatus;
167import org.apache.hadoop.fs.StorageType;
168import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
169import org.apache.hadoop.ha.ServiceFailedException;
170import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy;
171import org.apache.hadoop.hdfs.DFSConfigKeys;
172import org.apache.hadoop.hdfs.DFSUtil;
173import org.apache.hadoop.hdfs.HAUtil;
174import org.apache.hadoop.hdfs.HdfsConfiguration;
175import org.apache.hadoop.hdfs.UnknownCryptoProtocolVersionException;
176import org.apache.hadoop.hdfs.XAttrHelper;
177import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
178import org.apache.hadoop.hdfs.protocol.Block;
179import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
180import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
181import org.apache.hadoop.hdfs.protocol.CachePoolEntry;
182import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
183import org.apache.hadoop.hdfs.protocol.ClientProtocol;
184import org.apache.hadoop.hdfs.protocol.DatanodeID;
185import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
186import org.apache.hadoop.hdfs.protocol.DirectoryListing;
187import org.apache.hadoop.hdfs.protocol.EncryptionZone;
188import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
189import org.apache.hadoop.hdfs.protocol.HdfsConstants;
190import org.apache.hadoop.hdfs.protocol.LastBlockWithStatus;
191import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
192import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
193import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
194import org.apache.hadoop.hdfs.protocol.LocatedBlock;
195import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
196import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
197import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
198import org.apache.hadoop.hdfs.protocol.RollingUpgradeException;
199import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo;
200import org.apache.hadoop.hdfs.protocol.SnapshotAccessControlException;
201import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
202import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
203import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure;
204import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
205import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
206import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
207import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
208import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager.SecretManagerState;
209import org.apache.hadoop.hdfs.server.blockmanagement.BlockCollection;
210import org.apache.hadoop.hdfs.server.blockmanagement.BlockIdManager;
211import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguous;
212import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguousUnderConstruction;
213import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
214import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
215import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
216import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics;
217import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
218import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
219import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
220import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.RollingUpgradeStartupOption;
221import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
222import org.apache.hadoop.hdfs.server.common.Storage;
223import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
224import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
225import org.apache.hadoop.hdfs.server.common.Util;
226import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
227import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
228import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream;
229import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
230import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
231import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
232import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer;
233import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
234import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer;
235import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
236import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
237import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
238import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager;
239import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
240import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
241import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
242import org.apache.hadoop.hdfs.server.namenode.startupprogress.Status;
243import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
244import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
245import org.apache.hadoop.hdfs.server.namenode.top.TopAuditLogger;
246import org.apache.hadoop.hdfs.server.namenode.top.TopConf;
247import org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics;
248import org.apache.hadoop.hdfs.server.namenode.top.window.RollingWindowManager;
249import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
250import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
251import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
252import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
253import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
254import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
255import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
256import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
257import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
258import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
259import org.apache.hadoop.hdfs.server.protocol.StorageReport;
260import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
261import org.apache.hadoop.io.EnumSetWritable;
262import org.apache.hadoop.io.IOUtils;
263import org.apache.hadoop.io.Text;
264import org.apache.hadoop.ipc.RetriableException;
265import org.apache.hadoop.ipc.RetryCache;
266import org.apache.hadoop.ipc.Server;
267import org.apache.hadoop.ipc.StandbyException;
268import org.apache.hadoop.metrics2.annotation.Metric;
269import org.apache.hadoop.metrics2.annotation.Metrics;
270import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
271import org.apache.hadoop.metrics2.util.MBeans;
272import org.apache.hadoop.net.NetworkTopology;
273import org.apache.hadoop.net.Node;
274import org.apache.hadoop.net.NodeBase;
275import org.apache.hadoop.security.AccessControlException;
276import org.apache.hadoop.security.UserGroupInformation;
277import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
278import org.apache.hadoop.security.token.SecretManager.InvalidToken;
279import org.apache.hadoop.security.token.Token;
280import org.apache.hadoop.security.token.TokenIdentifier;
281import org.apache.hadoop.security.token.delegation.DelegationKey;
282import org.apache.hadoop.util.ChunkedArrayList;
283import org.apache.hadoop.util.Daemon;
284import org.apache.hadoop.util.DataChecksum;
285import org.apache.hadoop.util.ReflectionUtils;
286import org.apache.hadoop.util.StringUtils;
287import org.apache.hadoop.util.VersionInfo;
288import org.apache.log4j.Appender;
289import org.apache.log4j.AsyncAppender;
290import org.apache.log4j.Logger;
291import org.codehaus.jackson.map.ObjectMapper;
292import org.mortbay.util.ajax.JSON;
293
294import com.google.common.annotations.VisibleForTesting;
295import com.google.common.base.Charsets;
296import com.google.common.base.Preconditions;
297import com.google.common.collect.ImmutableMap;
298import com.google.common.collect.Lists;
299
300/***************************************************
301 * FSNamesystem does the actual bookkeeping work for the
302 * DataNode.
303 *
304 * It tracks several important tables.
305 *
306 * 1)  valid fsname --> blocklist  (kept on disk, logged)
307 * 2)  Set of all valid blocks (inverted #1)
308 * 3)  block --> machinelist (kept in memory, rebuilt dynamically from reports)
309 * 4)  machine --> blocklist (inverted #2)
310 * 5)  LRU cache of updated-heartbeat machines
311 ***************************************************/
312@InterfaceAudience.Private
313@Metrics(context="dfs")
314public class FSNamesystem implements Namesystem, FSNamesystemMBean,
315  NameNodeMXBean {
  /** Class-wide logger for FSNamesystem operations. */
  public static final Log LOG = LogFactory.getLog(FSNamesystem.class);

  // Per-thread scratch buffer, presumably reused when formatting audit log
  // lines to avoid per-event allocation -- usage not visible here, confirm.
  private static final ThreadLocal<StringBuilder> auditBuffer =
    new ThreadLocal<StringBuilder>() {
      @Override
      protected StringBuilder initialValue() {
        return new StringBuilder();
      }
  };

  /** Allocates and tracks block/generation-stamp IDs for this namespace. */
  private final BlockIdManager blockIdManager;
327
328  @VisibleForTesting
329  public boolean isAuditEnabled() {
330    return !isDefaultAuditLogger || auditLog.isInfoEnabled();
331  }
332
  /**
   * Convenience overload: logs an audit event with no destination path and
   * no file status. Delegates to the five-argument form.
   *
   * @param succeeded whether the operation succeeded
   * @param cmd the command name being audited
   * @param src the source path operated on
   */
  private void logAuditEvent(boolean succeeded, String cmd, String src)
      throws IOException {
    logAuditEvent(succeeded, cmd, src, null, null);
  }
337  
338  private void logAuditEvent(boolean succeeded, String cmd, String src,
339      String dst, HdfsFileStatus stat) throws IOException {
340    if (isAuditEnabled() && isExternalInvocation()) {
341      logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(),
342                    cmd, src, dst, stat);
343    }
344  }
345
346  private void logAuditEvent(boolean succeeded,
347      UserGroupInformation ugi, InetAddress addr, String cmd, String src,
348      String dst, HdfsFileStatus stat) {
349    FileStatus status = null;
350    if (stat != null) {
351      Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null;
352      Path path = dst != null ? new Path(dst) : new Path(src);
353      status = new FileStatus(stat.getLen(), stat.isDir(),
354          stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(),
355          stat.getAccessTime(), stat.getPermission(), stat.getOwner(),
356          stat.getGroup(), symlink, path);
357    }
358    for (AuditLogger logger : auditLoggers) {
359      if (logger instanceof HdfsAuditLogger) {
360        HdfsAuditLogger hdfsLogger = (HdfsAuditLogger) logger;
361        hdfsLogger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst,
362            status, ugi, dtSecretManager);
363      } else {
364        logger.logAuditEvent(succeeded, ugi.toString(), addr,
365            cmd, src, dst, status);
366      }
367    }
368  }
369
370  /**
371   * Logger for audit events, noting successful FSNamesystem operations. Emits
372   * to FSNamesystem.audit at INFO. Each event causes a set of tab-separated
373   * <code>key=value</code> pairs to be written for the following properties:
374   * <code>
375   * ugi=&lt;ugi in RPC&gt;
376   * ip=&lt;remote IP&gt;
377   * cmd=&lt;command&gt;
378   * src=&lt;src path&gt;
379   * dst=&lt;dst path (optional)&gt;
380   * perm=&lt;permissions (optional)&gt;
381   * </code>
382   */
383  public static final Log auditLog = LogFactory.getLog(
384      FSNamesystem.class.getName() + ".audit");
385
386  static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100;
387  static int BLOCK_DELETION_INCREMENT = 1000;
388  private final boolean isPermissionEnabled;
389  private final UserGroupInformation fsOwner;
390  private final String supergroup;
391  private final boolean standbyShouldCheckpoint;
392  
393  // Scan interval is not configurable.
394  private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
395    TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
396  final DelegationTokenSecretManager dtSecretManager;
397  private final boolean alwaysUseDelegationTokensForTests;
398
399  private static final Step STEP_AWAITING_REPORTED_BLOCKS =
400    new Step(StepType.AWAITING_REPORTED_BLOCKS);
401
402  // Tracks whether the default audit logger is the only configured audit
403  // logger; this allows isAuditEnabled() to return false in case the
404  // underlying logger is disabled, and avoid some unnecessary work.
405  private final boolean isDefaultAuditLogger;
406  private final List<AuditLogger> auditLoggers;
407
408  /** The namespace tree. */
409  FSDirectory dir;
410  private final BlockManager blockManager;
411  private final SnapshotManager snapshotManager;
412  private final CacheManager cacheManager;
413  private final DatanodeStatistics datanodeStatistics;
414
415  private String nameserviceId;
416
417  private volatile RollingUpgradeInfo rollingUpgradeInfo = null;
418  /**
419   * A flag that indicates whether the checkpointer should checkpoint a rollback
420   * fsimage. The edit log tailer sets this flag. The checkpoint will create a
421   * rollback fsimage if the flag is true, and then change the flag to false.
422   */
423  private volatile boolean needRollbackFsImage;
424
425  // Block pool ID used by this namenode
426  private String blockPoolId;
427
428  final LeaseManager leaseManager = new LeaseManager(this); 
429
430  volatile Daemon smmthread = null;  // SafeModeMonitor thread
431  
432  Daemon nnrmthread = null; // NamenodeResourceMonitor thread
433
434  Daemon nnEditLogRoller = null; // NameNodeEditLogRoller thread
435
436  // A daemon to periodically clean up corrupt lazyPersist files
437  // from the name space.
438  Daemon lazyPersistFileScrubber = null;
439  /**
440   * When an active namenode will roll its own edit log, in # edits
441   */
442  private final long editLogRollerThreshold;
443  /**
444   * Check interval of an active namenode's edit log roller thread 
445   */
446  private final int editLogRollerInterval;
447
448  /**
449   * How frequently we scan and unlink corrupt lazyPersist files.
450   * (In seconds)
451   */
452  private final int lazyPersistFileScrubIntervalSec;
453
454  private volatile boolean hasResourcesAvailable = false;
455  private volatile boolean fsRunning = true;
456  
457  /** The start time of the namesystem. */
458  private final long startTime = now();
459
460  /** The interval of namenode checking for the disk space availability */
461  private final long resourceRecheckInterval;
462
463  // The actual resource checker instance.
464  NameNodeResourceChecker nnResourceChecker;
465
466  private final FsServerDefaults serverDefaults;
467  private final boolean supportAppends;
468  private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;
469
470  private volatile SafeModeInfo safeMode;  // safe mode information
471
472  private final long maxFsObjects;          // maximum number of fs objects
473
474  private final long minBlockSize;         // minimum block size
475  private final long maxBlocksPerFile;     // maximum # of blocks per file
476
477  // precision of access times.
478  private final long accessTimePrecision;
479
480  /** Lock to protect FSNamesystem. */
481  private final FSNamesystemLock fsLock;
482
483  /** 
484   * Checkpoint lock to protect FSNamesystem modification on standby NNs.
485   * Unlike fsLock, it does not affect block updates. On active NNs, this lock
486   * does not provide proper protection, because there are operations that
487   * modify both block and name system state.  Even on standby, fsLock is 
488   * used when block state changes need to be blocked.
489   */
490  private final ReentrantLock cpLock;
491
492  /**
493   * Used when this NN is in standby state to read from the shared edit log.
494   */
495  private EditLogTailer editLogTailer = null;
496
497  /**
498   * Used when this NN is in standby state to perform checkpoints.
499   */
500  private StandbyCheckpointer standbyCheckpointer;
501
502  /**
503   * Reference to the NN's HAContext object. This is only set once
504   * {@link #startCommonServices(Configuration, HAContext)} is called. 
505   */
506  private HAContext haContext;
507
508  private final boolean haEnabled;
509
510  /** flag indicating whether replication queues have been initialized */
511  boolean initializedReplQueues = false;
512
513  /**
514   * Whether the namenode is in the middle of starting the active service
515   */
516  private volatile boolean startingActiveService = false;
517
518  private final RetryCache retryCache;
519
520  private KeyProviderCryptoExtension provider = null;
521
522  private volatile boolean imageLoaded = false;
523  private final Condition cond;
524
525  private final FSImage fsImage;
526
527  private final TopConf topConf;
528  private TopMetrics topMetrics;
529
530  private INodeAttributeProvider inodeAttributeProvider;
531
532  /**
533   * Notify that loading of this FSDirectory is complete, and
534   * it is imageLoaded for use
535   */
536  void imageLoadComplete() {
537    Preconditions.checkState(!imageLoaded, "FSDirectory already loaded");
538    setImageLoaded();
539  }
540
541  void setImageLoaded() {
542    if(imageLoaded) return;
543    writeLock();
544    try {
545      setImageLoaded(true);
546      dir.markNameCacheInitialized();
547      cond.signalAll();
548    } finally {
549      writeUnlock();
550    }
551  }
552
  /** Returns whether the fsimage has been loaded; for testing purposes only. */
  @VisibleForTesting
  boolean isImageLoaded() {
    return imageLoaded;
  }
558
  // exposed for unit tests
  /** Directly sets the image-loaded flag without locking or signalling. */
  protected void setImageLoaded(boolean flag) {
    imageLoaded = flag;
  }
563
564  /**
565   * Block until the object is imageLoaded to be used.
566   */
567  void waitForLoadingFSImage() {
568    if (!imageLoaded) {
569      writeLock();
570      try {
571        while (!imageLoaded) {
572          try {
573            cond.await(5000, TimeUnit.MILLISECONDS);
574          } catch (InterruptedException ignored) {
575          }
576        }
577      } finally {
578        writeUnlock();
579      }
580    }
581  }
582
583  /**
584   * Clear all loaded data
585   */
586  void clear() {
587    dir.reset();
588    dtSecretManager.reset();
589    blockIdManager.clear();
590    leaseManager.removeAllLeases();
591    snapshotManager.clearSnapshottableDirs();
592    cacheManager.clear();
593    setImageLoaded(false);
594    blockManager.clear();
595  }
596
  /** Returns the lease manager; exposed for testing. */
  @VisibleForTesting
  LeaseManager getLeaseManager() {
    return leaseManager;
  }
601  
  /** Returns whether HA is enabled for this namesystem. */
  boolean isHaEnabled() {
    return haEnabled;
  }
605  
606  /**
607   * Check the supplied configuration for correctness.
608   * @param conf Supplies the configuration to validate.
609   * @throws IOException if the configuration could not be queried.
610   * @throws IllegalArgumentException if the configuration is invalid.
611   */
612  private static void checkConfiguration(Configuration conf)
613      throws IOException {
614
615    final Collection<URI> namespaceDirs =
616        FSNamesystem.getNamespaceDirs(conf);
617    final Collection<URI> editsDirs =
618        FSNamesystem.getNamespaceEditsDirs(conf);
619    final Collection<URI> requiredEditsDirs =
620        FSNamesystem.getRequiredNamespaceEditsDirs(conf);
621    final Collection<URI> sharedEditsDirs =
622        FSNamesystem.getSharedEditsDirs(conf);
623
624    for (URI u : requiredEditsDirs) {
625      if (u.toString().compareTo(
626              DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT) == 0) {
627        continue;
628      }
629
630      // Each required directory must also be in editsDirs or in
631      // sharedEditsDirs.
632      if (!editsDirs.contains(u) &&
633          !sharedEditsDirs.contains(u)) {
634        throw new IllegalArgumentException(
635            "Required edits directory " + u.toString() + " not present in " +
636            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + ". " +
637            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + "=" +
638            editsDirs.toString() + "; " +
639            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY + "=" +
640            requiredEditsDirs.toString() + ". " +
641            DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY + "=" +
642            sharedEditsDirs.toString() + ".");
643      }
644    }
645
646    if (namespaceDirs.size() == 1) {
647      LOG.warn("Only one image storage directory ("
648          + DFS_NAMENODE_NAME_DIR_KEY + ") configured. Beware of data loss"
649          + " due to lack of redundant storage directories!");
650    }
651    if (editsDirs.size() == 1) {
652      LOG.warn("Only one namespace edits storage directory ("
653          + DFS_NAMENODE_EDITS_DIR_KEY + ") configured. Beware of data loss"
654          + " due to lack of redundant storage directories!");
655    }
656  }
657
658  /**
659   * Instantiates an FSNamesystem loaded from the image and edits
660   * directories specified in the passed Configuration.
661   *
662   * @param conf the Configuration which specifies the storage directories
663   *             from which to load
664   * @return an FSNamesystem which contains the loaded namespace
665   * @throws IOException if loading fails
666   */
667  static FSNamesystem loadFromDisk(Configuration conf) throws IOException {
668
669    checkConfiguration(conf);
670    FSImage fsImage = new FSImage(conf,
671        FSNamesystem.getNamespaceDirs(conf),
672        FSNamesystem.getNamespaceEditsDirs(conf));
673    FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
674    StartupOption startOpt = NameNode.getStartupOption(conf);
675    if (startOpt == StartupOption.RECOVER) {
676      namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
677    }
678
679    long loadStart = monotonicNow();
680    try {
681      namesystem.loadFSImage(startOpt);
682    } catch (IOException ioe) {
683      LOG.warn("Encountered exception loading fsimage", ioe);
684      fsImage.close();
685      throw ioe;
686    }
687    long timeTakenToLoadFSImage = monotonicNow() - loadStart;
688    LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
689    NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
690    if (nnMetrics != null) {
691      nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
692    }
693    return namesystem;
694  }
695  
  /**
   * Convenience constructor; equivalent to
   * {@code FSNamesystem(conf, fsImage, false)} (retry cache enabled).
   */
  FSNamesystem(Configuration conf, FSImage fsImage) throws IOException {
    this(conf, fsImage, false);
  }
699  
700  /**
701   * Create an FSNamesystem associated with the specified image.
702   * 
703   * Note that this does not load any data off of disk -- if you would
704   * like that behavior, use {@link #loadFromDisk(Configuration)}
705   *
706   * @param conf configuration
707   * @param fsImage The FSImage to associate with
708   * @param ignoreRetryCache Whether or not should ignore the retry cache setup
709   *                         step. For Secondary NN this should be set to true.
710   * @throws IOException on bad configuration
711   */
712  FSNamesystem(Configuration conf, FSImage fsImage, boolean ignoreRetryCache)
713      throws IOException {
714    provider = DFSUtil.createKeyProviderCryptoExtension(conf);
715    if (provider == null) {
716      LOG.info("No KeyProvider found.");
717    } else {
718      LOG.info("Found KeyProvider: " + provider.toString());
719    }
720    if (conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY,
721                        DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT)) {
722      LOG.info("Enabling async auditlog");
723      enableAsyncAuditLog();
724    }
725    boolean fair = conf.getBoolean("dfs.namenode.fslock.fair", true);
726    LOG.info("fsLock is fair:" + fair);
727    fsLock = new FSNamesystemLock(fair);
728    cond = fsLock.writeLock().newCondition();
729    cpLock = new ReentrantLock();
730
731    this.fsImage = fsImage;
732    try {
733      resourceRecheckInterval = conf.getLong(
734          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY,
735          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT);
736
737      this.blockManager = new BlockManager(this, conf);
738      this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics();
739      this.blockIdManager = new BlockIdManager(blockManager);
740
741      this.fsOwner = UserGroupInformation.getCurrentUser();
742      this.supergroup = conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY, 
743                                 DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
744      this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY,
745                                                 DFS_PERMISSIONS_ENABLED_DEFAULT);
746      LOG.info("fsOwner             = " + fsOwner);
747      LOG.info("supergroup          = " + supergroup);
748      LOG.info("isPermissionEnabled = " + isPermissionEnabled);
749
750      // block allocation has to be persisted in HA using a shared edits directory
751      // so that the standby has up-to-date namespace information
752      nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
753      this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId);  
754      
755      // Sanity check the HA-related config.
756      if (nameserviceId != null) {
757        LOG.info("Determined nameservice ID: " + nameserviceId);
758      }
759      LOG.info("HA Enabled: " + haEnabled);
760      if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) {
761        LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf));
762        throw new IOException("Invalid configuration: a shared edits dir " +
763            "must not be specified if HA is not enabled.");
764      }
765
766      // Get the checksum type from config
767      String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT);
768      DataChecksum.Type checksumType;
769      try {
770         checksumType = DataChecksum.Type.valueOf(checksumTypeStr);
771      } catch (IllegalArgumentException iae) {
772         throw new IOException("Invalid checksum type in "
773            + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr);
774      }
775
776      this.serverDefaults = new FsServerDefaults(
777          conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT),
778          conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT),
779          conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT),
780          (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT),
781          conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT),
782          conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT),
783          conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT),
784          checksumType);
785      
786      this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY, 
787                                       DFS_NAMENODE_MAX_OBJECTS_DEFAULT);
788
789      this.minBlockSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,
790          DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_DEFAULT);
791      this.maxBlocksPerFile = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY,
792          DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT);
793      this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY,
794          DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT);
795      this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT);
796      LOG.info("Append Enabled: " + supportAppends);
797
798      this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);
799      
800      this.standbyShouldCheckpoint = conf.getBoolean(
801          DFS_HA_STANDBY_CHECKPOINTS_KEY, DFS_HA_STANDBY_CHECKPOINTS_DEFAULT);
802      // # edit autoroll threshold is a multiple of the checkpoint threshold 
803      this.editLogRollerThreshold = (long)
804          (conf.getFloat(
805              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD,
806              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT) *
807          conf.getLong(
808              DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
809              DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT));
810      this.editLogRollerInterval = conf.getInt(
811          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS,
812          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT);
813
814      this.lazyPersistFileScrubIntervalSec = conf.getInt(
815          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC,
816          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT);
817
818      if (this.lazyPersistFileScrubIntervalSec == 0) {
819        throw new IllegalArgumentException(
820            DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC + " must be non-zero.");
821      }
822
823      // For testing purposes, allow the DT secret manager to be started regardless
824      // of whether security is enabled.
825      alwaysUseDelegationTokensForTests = conf.getBoolean(
826          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
827          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);
828      
829      this.dtSecretManager = createDelegationTokenSecretManager(conf);
830      this.dir = new FSDirectory(this, conf);
831      this.snapshotManager = new SnapshotManager(dir);
832      this.cacheManager = new CacheManager(this, conf, blockManager);
833      this.safeMode = new SafeModeInfo(conf);
834      this.topConf = new TopConf(conf);
835      this.auditLoggers = initAuditLoggers(conf);
836      this.isDefaultAuditLogger = auditLoggers.size() == 1 &&
837        auditLoggers.get(0) instanceof DefaultAuditLogger;
838      this.retryCache = ignoreRetryCache ? null : initRetryCache(conf);
839      Class<? extends INodeAttributeProvider> klass = conf.getClass(
840          DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY,
841          null, INodeAttributeProvider.class);
842      if (klass != null) {
843        inodeAttributeProvider = ReflectionUtils.newInstance(klass, conf);
844        LOG.info("Using INode attribute provider: " + klass.getName());
845      }
846    } catch(IOException e) {
847      LOG.error(getClass().getSimpleName() + " initialization failed.", e);
848      close();
849      throw e;
850    } catch (RuntimeException re) {
851      LOG.error(getClass().getSimpleName() + " initialization failed.", re);
852      close();
853      throw re;
854    }
855  }
856
  /** @return the installed audit loggers (unmodifiable); exposed for tests. */
  @VisibleForTesting
  public List<AuditLogger> getAuditLoggers() {
    return auditLoggers;
  }
861
  /** @return the retry cache, or null when disabled; exposed for tests. */
  @VisibleForTesting
  public RetryCache getRetryCache() {
    return retryCache;
  }
866
  /** Lock the retry cache; no-op when the cache is disabled. */
  void lockRetryCache() {
    if (retryCache != null) {
      retryCache.lock();
    }
  }
872
  /** Unlock the retry cache; no-op when the cache is disabled. */
  void unlockRetryCache() {
    if (retryCache != null) {
      retryCache.unlock();
    }
  }
878
  /** @return whether the retry cache is enabled */
  boolean hasRetryCache() {
    return retryCache != null;
  }
883  
  /**
   * Record a completed call plus its result payload in the retry cache,
   * so a retried RPC can be answered without re-executing. No-op when
   * the cache is disabled.
   */
  void addCacheEntryWithPayload(byte[] clientId, int callId, Object payload) {
    if (retryCache != null) {
      retryCache.addCacheEntryWithPayload(clientId, callId, payload);
    }
  }
889  
  /**
   * Record a completed call (no payload) in the retry cache. No-op when
   * the cache is disabled.
   */
  void addCacheEntry(byte[] clientId, int callId) {
    if (retryCache != null) {
      retryCache.addCacheEntry(clientId, callId);
    }
  }
895
  /** @return the encryption key provider, or null if none is configured. */
  @VisibleForTesting
  public KeyProviderCryptoExtension getProvider() {
    return provider;
  }
900
901  @VisibleForTesting
902  static RetryCache initRetryCache(Configuration conf) {
903    boolean enable = conf.getBoolean(DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY,
904                                     DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT);
905    LOG.info("Retry cache on namenode is " + (enable ? "enabled" : "disabled"));
906    if (enable) {
907      float heapPercent = conf.getFloat(
908          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY,
909          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT);
910      long entryExpiryMillis = conf.getLong(
911          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY,
912          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT);
913      LOG.info("Retry cache will use " + heapPercent
914          + " of total heap and retry cache entry expiry time is "
915          + entryExpiryMillis + " millis");
916      long entryExpiryNanos = entryExpiryMillis * 1000 * 1000;
917      return new RetryCache("NameNodeRetryCache", heapPercent,
918          entryExpiryNanos);
919    }
920    return null;
921  }
922
923  private List<AuditLogger> initAuditLoggers(Configuration conf) {
924    // Initialize the custom access loggers if configured.
925    Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY);
926    List<AuditLogger> auditLoggers = Lists.newArrayList();
927    if (alClasses != null && !alClasses.isEmpty()) {
928      for (String className : alClasses) {
929        try {
930          AuditLogger logger;
931          if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) {
932            logger = new DefaultAuditLogger();
933          } else {
934            logger = (AuditLogger) Class.forName(className).newInstance();
935          }
936          logger.initialize(conf);
937          auditLoggers.add(logger);
938        } catch (RuntimeException re) {
939          throw re;
940        } catch (Exception e) {
941          throw new RuntimeException(e);
942        }
943      }
944    }
945
946    // Make sure there is at least one logger installed.
947    if (auditLoggers.isEmpty()) {
948      auditLoggers.add(new DefaultAuditLogger());
949    }
950
951    // Add audit logger to calculate top users
952    if (topConf.isEnabled) {
953      topMetrics = new TopMetrics(conf, topConf.nntopReportingPeriodsMs);
954      auditLoggers.add(new TopAuditLogger(topMetrics));
955    }
956
957    return Collections.unmodifiableList(auditLoggers);
958  }
959
  /**
   * Load the namespace from the FSImage, formatting first when the
   * FORMAT startup option is given, saving a fresh image when the loaded
   * one is stale, and opening the edit log for write where appropriate.
   * Runs under the write lock; closes the image on failure.
   */
  private void loadFSImage(StartupOption startOpt) throws IOException {
    final FSImage fsImage = getFSImage();

    // format before starting up if requested
    if (startOpt == StartupOption.FORMAT) {
      
      fsImage.format(this, fsImage.getStorage().determineClusterId());// reuse current id

      // After formatting, continue startup as a regular boot.
      startOpt = StartupOption.REGULAR;
    }
    boolean success = false;
    writeLock();
    try {
      // We shouldn't be calling saveNamespace if we've come up in standby state.
      MetaRecoveryContext recovery = startOpt.createRecoveryContext();
      final boolean staleImage
          = fsImage.recoverTransitionRead(startOpt, this, recovery);
      if (RollingUpgradeStartupOption.ROLLBACK.matches(startOpt) ||
          RollingUpgradeStartupOption.DOWNGRADE.matches(startOpt)) {
        // Rolling upgrade state does not survive a rollback/downgrade.
        rollingUpgradeInfo = null;
      }
      final boolean needToSave = staleImage && !haEnabled && !isRollingUpgrade(); 
      LOG.info("Need to save fs image? " + needToSave
          + " (staleImage=" + staleImage + ", haEnabled=" + haEnabled
          + ", isRollingUpgrade=" + isRollingUpgrade() + ")");
      if (needToSave) {
        fsImage.saveNamespace(this);
      } else {
        updateStorageVersionForRollingUpgrade(fsImage.getLayoutVersion(),
            startOpt);
        // No need to save, so mark the phase done.
        StartupProgress prog = NameNode.getStartupProgress();
        prog.beginPhase(Phase.SAVING_CHECKPOINT);
        prog.endPhase(Phase.SAVING_CHECKPOINT);
      }
      // This will start a new log segment and write to the seen_txid file, so
      // we shouldn't do it when coming up in standby state
      if (!haEnabled || (haEnabled && startOpt == StartupOption.UPGRADE)
          || (haEnabled && startOpt == StartupOption.UPGRADEONLY)) {
        fsImage.openEditLogForWrite();
      }
      success = true;
    } finally {
      if (!success) {
        // Loading failed: release the image's storage before unlocking.
        fsImage.close();
      }
      writeUnlock();
    }
    imageLoadComplete();
  }
1010
1011  private void updateStorageVersionForRollingUpgrade(final long layoutVersion,
1012      StartupOption startOpt) throws IOException {
1013    boolean rollingStarted = RollingUpgradeStartupOption.STARTED
1014        .matches(startOpt) && layoutVersion > HdfsConstants
1015        .NAMENODE_LAYOUT_VERSION;
1016    boolean rollingRollback = RollingUpgradeStartupOption.ROLLBACK
1017        .matches(startOpt);
1018    if (rollingRollback || rollingStarted) {
1019      fsImage.updateStorageVersion();
1020    }
1021  }
1022
1023  private void startSecretManager() {
1024    if (dtSecretManager != null) {
1025      try {
1026        dtSecretManager.startThreads();
1027      } catch (IOException e) {
1028        // Inability to start secret manager
1029        // can't be recovered from.
1030        throw new RuntimeException(e);
1031      }
1032    }
1033  }
1034  
1035  private void startSecretManagerIfNecessary() {
1036    boolean shouldRun = shouldUseDelegationTokens() &&
1037      !isInSafeMode() && getEditLog().isOpenForWrite();
1038    boolean running = dtSecretManager.isRunning();
1039    if (shouldRun && !running) {
1040      startSecretManager();
1041    }
1042  }
1043
  /** Stop the delegation token secret manager threads, if one exists. */
  private void stopSecretManager() {
    if (dtSecretManager != null) {
      dtSecretManager.stopThreads();
    }
  }
1049  
1050  /** 
1051   * Start services common to both active and standby states
1052   */
1053  void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
1054    this.registerMBean(); // register the MBean for the FSNamesystemState
1055    writeLock();
1056    this.haContext = haContext;
1057    try {
1058      nnResourceChecker = new NameNodeResourceChecker(conf);
1059      checkAvailableResources();
1060      assert safeMode != null && !isPopulatingReplQueues();
1061      StartupProgress prog = NameNode.getStartupProgress();
1062      prog.beginPhase(Phase.SAFEMODE);
1063      prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
1064        getCompleteBlocksTotal());
1065      setBlockTotal();
1066      blockManager.activate(conf);
1067    } finally {
1068      writeUnlock();
1069    }
1070    
1071    registerMXBean();
1072    DefaultMetricsSystem.instance().register(this);
1073    if (inodeAttributeProvider != null) {
1074      inodeAttributeProvider.start();
1075      dir.setINodeAttributeProvider(inodeAttributeProvider);
1076    }
1077    snapshotManager.registerMXBean();
1078  }
1079  
1080  /** 
1081   * Stop services common to both active and standby states
1082   */
1083  void stopCommonServices() {
1084    writeLock();
1085    if (inodeAttributeProvider != null) {
1086      dir.setINodeAttributeProvider(null);
1087      inodeAttributeProvider.stop();
1088    }
1089    try {
1090      if (blockManager != null) blockManager.close();
1091    } finally {
1092      writeUnlock();
1093    }
1094    RetryCache.clear(retryCache);
1095  }
1096  
1097  /**
1098   * Start services required in active state
1099   * @throws IOException
1100   */
1101  void startActiveServices() throws IOException {
1102    startingActiveService = true;
1103    LOG.info("Starting services required for active state");
1104    writeLock();
1105    try {
1106      FSEditLog editLog = getFSImage().getEditLog();
1107      
1108      if (!editLog.isOpenForWrite()) {
1109        // During startup, we're already open for write during initialization.
1110        editLog.initJournalsForWrite();
1111        // May need to recover
1112        editLog.recoverUnclosedStreams();
1113        
1114        LOG.info("Catching up to latest edits from old active before " +
1115            "taking over writer role in edits logs");
1116        editLogTailer.catchupDuringFailover();
1117        
1118        blockManager.setPostponeBlocksFromFuture(false);
1119        blockManager.getDatanodeManager().markAllDatanodesStale();
1120        blockManager.clearQueues();
1121        blockManager.processAllPendingDNMessages();
1122
1123        // Only need to re-process the queue, If not in SafeMode.
1124        if (!isInSafeMode()) {
1125          LOG.info("Reprocessing replication and invalidation queues");
1126          initializeReplQueues();
1127        }
1128
1129        if (LOG.isDebugEnabled()) {
1130          LOG.debug("NameNode metadata after re-processing " +
1131              "replication and invalidation queues during failover:\n" +
1132              metaSaveAsString());
1133        }
1134        
1135        long nextTxId = getFSImage().getLastAppliedTxId() + 1;
1136        LOG.info("Will take over writing edit logs at txnid " + 
1137            nextTxId);
1138        editLog.setNextTxId(nextTxId);
1139
1140        getFSImage().editLog.openForWrite();
1141      }
1142
1143      // Enable quota checks.
1144      dir.enableQuotaChecks();
1145      if (haEnabled) {
1146        // Renew all of the leases before becoming active.
1147        // This is because, while we were in standby mode,
1148        // the leases weren't getting renewed on this NN.
1149        // Give them all a fresh start here.
1150        leaseManager.renewAllLeases();
1151      }
1152      leaseManager.startMonitor();
1153      startSecretManagerIfNecessary();
1154
1155      //ResourceMonitor required only at ActiveNN. See HDFS-2914
1156      this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
1157      nnrmthread.start();
1158
1159      nnEditLogRoller = new Daemon(new NameNodeEditLogRoller(
1160          editLogRollerThreshold, editLogRollerInterval));
1161      nnEditLogRoller.start();
1162
1163      if (lazyPersistFileScrubIntervalSec > 0) {
1164        lazyPersistFileScrubber = new Daemon(new LazyPersistFileScrubber(
1165            lazyPersistFileScrubIntervalSec));
1166        lazyPersistFileScrubber.start();
1167      }
1168
1169      cacheManager.startMonitorThread();
1170      blockManager.getDatanodeManager().setShouldSendCachingCommands(true);
1171    } finally {
1172      startingActiveService = false;
1173      checkSafeMode();
1174      writeUnlock();
1175    }
1176  }
1177
1178  /**
1179   * Initialize replication queues.
1180   */
1181  private void initializeReplQueues() {
1182    LOG.info("initializing replication queues");
1183    blockManager.processMisReplicatedBlocks();
1184    initializedReplQueues = true;
1185  }
1186
  /** @return true when an HA context exists and reports the ACTIVE state. */
  private boolean inActiveState() {
    return haContext != null &&
        haContext.getState().getServiceState() == HAServiceState.ACTIVE;
  }
1191
1192  /**
1193   * @return Whether the namenode is transitioning to active state and is in the
1194   *         middle of the {@link #startActiveServices()}
1195   */
1196  public boolean inTransitionToActive() {
1197    return haEnabled && inActiveState() && startingActiveService;
1198  }
1199
  /**
   * @return true when delegation tokens are in use: security is enabled,
   *         or the always-use-for-tests override is set.
   */
  private boolean shouldUseDelegationTokens() {
    return UserGroupInformation.isSecurityEnabled() ||
      alwaysUseDelegationTokensForTests;
  }
1204
1205  /** 
1206   * Stop services required in active state
1207   */
1208  void stopActiveServices() {
1209    LOG.info("Stopping services started for active state");
1210    writeLock();
1211    try {
1212      stopSecretManager();
1213      leaseManager.stopMonitor();
1214      if (nnrmthread != null) {
1215        ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
1216        nnrmthread.interrupt();
1217      }
1218      if (nnEditLogRoller != null) {
1219        ((NameNodeEditLogRoller)nnEditLogRoller.getRunnable()).stop();
1220        nnEditLogRoller.interrupt();
1221      }
1222      if (lazyPersistFileScrubber != null) {
1223        ((LazyPersistFileScrubber) lazyPersistFileScrubber.getRunnable()).stop();
1224        lazyPersistFileScrubber.interrupt();
1225      }
1226      if (dir != null && getFSImage() != null) {
1227        if (getFSImage().editLog != null) {
1228          getFSImage().editLog.close();
1229        }
1230        // Update the fsimage with the last txid that we wrote
1231        // so that the tailer starts from the right spot.
1232        getFSImage().updateLastAppliedTxIdFromWritten();
1233      }
1234      if (cacheManager != null) {
1235        cacheManager.stopMonitorThread();
1236        cacheManager.clearDirectiveStats();
1237      }
1238      blockManager.getDatanodeManager().clearPendingCachingCommands();
1239      blockManager.getDatanodeManager().setShouldSendCachingCommands(false);
1240      // Don't want to keep replication queues when not in Active.
1241      blockManager.clearQueues();
1242      initializedReplQueues = false;
1243    } finally {
1244      writeUnlock();
1245    }
1246  }
1247  
1248  /**
1249   * Start services required in standby state 
1250   * 
1251   * @throws IOException
1252   */
1253  void startStandbyServices(final Configuration conf) throws IOException {
1254    LOG.info("Starting services required for standby state");
1255    if (!getFSImage().editLog.isOpenForRead()) {
1256      // During startup, we're already open for read.
1257      getFSImage().editLog.initSharedJournalsForRead();
1258    }
1259    
1260    blockManager.setPostponeBlocksFromFuture(true);
1261
1262    // Disable quota checks while in standby.
1263    dir.disableQuotaChecks();
1264    editLogTailer = new EditLogTailer(this, conf);
1265    editLogTailer.start();
1266    if (standbyShouldCheckpoint) {
1267      standbyCheckpointer = new StandbyCheckpointer(conf, this);
1268      standbyCheckpointer.start();
1269    }
1270  }
1271
1272  /**
1273   * Called when the NN is in Standby state and the editlog tailer tails the
1274   * OP_ROLLING_UPGRADE_START.
1275   */
1276  void triggerRollbackCheckpoint() {
1277    setNeedRollbackFsImage(true);
1278    if (standbyCheckpointer != null) {
1279      standbyCheckpointer.triggerRollbackCheckpoint();
1280    }
1281  }
1282
1283  /**
1284   * Called while the NN is in Standby state, but just about to be
1285   * asked to enter Active state. This cancels any checkpoints
1286   * currently being taken.
1287   */
1288  void prepareToStopStandbyServices() throws ServiceFailedException {
1289    if (standbyCheckpointer != null) {
1290      standbyCheckpointer.cancelAndPreventCheckpoints(
1291          "About to leave standby state");
1292    }
1293  }
1294
  /**
   * Stop services required in standby state: the checkpointer, the edit
   * log tailer, and the edit log opened for read.
   */
  void stopStandbyServices() throws IOException {
    LOG.info("Stopping services started for standby state");
    if (standbyCheckpointer != null) {
      standbyCheckpointer.stop();
    }
    if (editLogTailer != null) {
      editLogTailer.stop();
    }
    if (dir != null && getFSImage() != null && getFSImage().editLog != null) {
      getFSImage().editLog.close();
    }
  }
1308  
  /**
   * Verify that the requested operation category is permitted in the
   * current HA state, delegating to the HA context.
   *
   * @throws StandbyException if the operation is not allowed in standby
   */
  @Override
  public void checkOperation(OperationCategory op) throws StandbyException {
    if (haContext != null) {
      // null in some unit tests
      haContext.checkOperation(op);
    }
  }
1316  
1317  /**
1318   * @throws RetriableException
1319   *           If 1) The NameNode is in SafeMode, 2) HA is enabled, and 3)
1320   *           NameNode is in active state
1321   * @throws SafeModeException
1322   *           Otherwise if NameNode is in SafeMode.
1323   */
1324  void checkNameNodeSafeMode(String errorMsg)
1325      throws RetriableException, SafeModeException {
1326    if (isInSafeMode()) {
1327      SafeModeException se = new SafeModeException(errorMsg, safeMode);
1328      if (haEnabled && haContext != null
1329          && haContext.getState().getServiceState() == HAServiceState.ACTIVE
1330          && shouldRetrySafeMode(this.safeMode)) {
1331        throw new RetriableException(se);
1332      } else {
1333        throw se;
1334      }
1335    }
1336  }
1337
  /** @return whether permission checking is enabled for this namesystem. */
  boolean isPermissionEnabled() {
    return isPermissionEnabled;
  }
1341
1342  /**
1343   * We already know that the safemode is on. We will throw a RetriableException
1344   * if the safemode is not manual or caused by low resource.
1345   */
1346  private boolean shouldRetrySafeMode(SafeModeInfo safeMode) {
1347    if (safeMode == null) {
1348      return false;
1349    } else {
1350      return !safeMode.isManual() && !safeMode.areResourcesLow();
1351    }
1352  }
1353  
  /** @return the configured fsimage (name) storage directories as URIs. */
  public static Collection<URI> getNamespaceDirs(Configuration conf) {
    return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
  }
1357
1358  /**
1359   * Get all edits dirs which are required. If any shared edits dirs are
1360   * configured, these are also included in the set of required dirs.
1361   * 
1362   * @param conf the HDFS configuration.
1363   * @return all required dirs.
1364   */
1365  public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) {
1366    Set<URI> ret = new HashSet<URI>();
1367    ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY));
1368    ret.addAll(getSharedEditsDirs(conf));
1369    return ret;
1370  }
1371
  /**
   * Resolve a storage-directory property to a collection of URIs.
   * Under the IMPORT startup option, directories that come from the
   * default resources are stripped so the NN can start with an empty
   * storage set; otherwise an empty list falls back to the default
   * directory.
   */
  private static Collection<URI> getStorageDirs(Configuration conf,
                                                String propertyName) {
    Collection<String> dirNames = conf.getTrimmedStringCollection(propertyName);
    StartupOption startOpt = NameNode.getStartupOption(conf);
    if(startOpt == StartupOption.IMPORT) {
      // In case of IMPORT this will get rid of default directories 
      // but will retain directories specified in hdfs-site.xml
      // When importing image from a checkpoint, the name-node can
      // start with empty set of storage directories.
      Configuration cE = new HdfsConfiguration(false);
      cE.addResource("core-default.xml");
      cE.addResource("core-site.xml");
      cE.addResource("hdfs-default.xml");
      Collection<String> dirNames2 = cE.getTrimmedStringCollection(propertyName);
      dirNames.removeAll(dirNames2);
      if(dirNames.isEmpty())
        LOG.warn("!!! WARNING !!!" +
          "\n\tThe NameNode currently runs without persistent storage." +
          "\n\tAny changes to the file system meta-data may be lost." +
          "\n\tRecommended actions:" +
          "\n\t\t- shutdown and restart NameNode with configured \"" 
          + propertyName + "\" in hdfs-site.xml;" +
          "\n\t\t- use Backup Node as a persistent and up-to-date storage " +
          "of the file system meta-data.");
    } else if (dirNames.isEmpty()) {
      // NOTE(review): the edits-dir default is used for every property
      // handled here (name dirs included) — presumably intentional since
      // the defaults point at the same location; confirm before changing.
      dirNames = Collections.singletonList(
          DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT);
    }
    return Util.stringCollectionAsURIs(dirNames);
  }
1402
1403  /**
1404   * Return an ordered list of edits directories to write to.
1405   * The list is ordered such that all shared edits directories
1406   * are ordered before non-shared directories, and any duplicates
1407   * are removed. The order they are specified in the configuration
1408   * is retained.
1409   * @return Collection of shared edits directories.
1410   * @throws IOException if multiple shared edits directories are configured
1411   */
1412  public static List<URI> getNamespaceEditsDirs(Configuration conf)
1413      throws IOException {
1414    return getNamespaceEditsDirs(conf, true);
1415  }
1416  
  /**
   * Return the ordered, de-duplicated list of edits directories,
   * optionally including shared edits dirs (which always come first).
   * Falls back to the image directories when no edits dirs are
   * configured at all.
   *
   * @param conf the HDFS configuration
   * @param includeShared whether to include shared edits directories
   * @throws IOException if multiple shared edits directories are configured
   */
  public static List<URI> getNamespaceEditsDirs(Configuration conf,
      boolean includeShared)
      throws IOException {
    // Use a LinkedHashSet so that order is maintained while we de-dup
    // the entries.
    LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>();
    
    if (includeShared) {
      List<URI> sharedDirs = getSharedEditsDirs(conf);
  
      // Fail until multiple shared edits directories are supported (HDFS-2782)
      if (sharedDirs.size() > 1) {
        throw new IOException(
            "Multiple shared edits directories are not yet supported");
      }
  
      // First add the shared edits dirs. It's critical that the shared dirs
      // are added first, since JournalSet syncs them in the order they are listed,
      // and we need to make sure all edits are in place in the shared storage
      // before they are replicated locally. See HDFS-2874.
      for (URI dir : sharedDirs) {
        if (!editsDirs.add(dir)) {
          LOG.warn("Edits URI " + dir + " listed multiple times in " + 
              DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates.");
        }
      }
    }    
    // Now add the non-shared dirs.
    for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) {
      if (!editsDirs.add(dir)) {
        LOG.warn("Edits URI " + dir + " listed multiple times in " + 
            DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " +
            DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates.");
      }
    }

    if (editsDirs.isEmpty()) {
      // If this is the case, no edit dirs have been explicitly configured.
      // Image dirs are to be used for edits too.
      return Lists.newArrayList(getNamespaceDirs(conf));
    } else {
      return Lists.newArrayList(editsDirs);
    }
  }
1461  
1462  /**
1463   * Returns edit directories that are shared between primary and secondary.
1464   * @param conf configuration
1465   * @return collection of edit directories from {@code conf}
1466   */
1467  public static List<URI> getSharedEditsDirs(Configuration conf) {
1468    // don't use getStorageDirs here, because we want an empty default
1469    // rather than the dir in /tmp
1470    Collection<String> dirNames = conf.getTrimmedStringCollection(
1471        DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
1472    return Util.stringCollectionAsURIs(dirNames);
1473  }
1474
  /** Acquire the global namesystem read lock; blocks until granted. */
  @Override
  public void readLock() {
    this.fsLock.readLock().lock();
  }
  /** Release one hold of the namesystem read lock. */
  @Override
  public void readUnlock() {
    this.fsLock.readLock().unlock();
  }
  /** Acquire the exclusive namesystem write lock; blocks until granted. */
  @Override
  public void writeLock() {
    this.fsLock.writeLock().lock();
  }
  /** Like {@link #writeLock()}, but responds to interruption while waiting. */
  @Override
  public void writeLockInterruptibly() throws InterruptedException {
    this.fsLock.writeLock().lockInterruptibly();
  }
  /** Release one hold of the namesystem write lock. */
  @Override
  public void writeUnlock() {
    this.fsLock.writeLock().unlock();
  }
  /** @return true iff the current thread holds the write lock. */
  @Override
  public boolean hasWriteLock() {
    return this.fsLock.isWriteLockedByCurrentThread();
  }
  /** @return true iff the current thread holds a read or write lock. */
  @Override
  public boolean hasReadLock() {
    // Holding the write lock also grants read access.
    return this.fsLock.getReadHoldCount() > 0 || hasWriteLock();
  }
1503
  /** @return the current thread's read-lock hold count. */
  public int getReadHoldCount() {
    return this.fsLock.getReadHoldCount();
  }
1507
  /** @return the current thread's write-lock hold count. */
  public int getWriteHoldCount() {
    return this.fsLock.getWriteHoldCount();
  }
1511
  /** Lock the checkpoint lock. */
  public void cpLock() {
    this.cpLock.lock();
  }
1516
  /** Lock the checkpoint lock interruptibly. */
  public void cpLockInterruptibly() throws InterruptedException {
    this.cpLock.lockInterruptibly();
  }
1521
  /** Unlock the checkpoint lock. */
  public void cpUnlock() {
    this.cpLock.unlock();
  }
1526    
1527
  /** @return namespace info for this namenode, read under the FSN read lock. */
  NamespaceInfo getNamespaceInfo() {
    readLock();
    try {
      return unprotectedGetNamespaceInfo();
    } finally {
      readUnlock();
    }
  }
1536
1537  /**
1538   * Version of @see #getNamespaceInfo() that is not protected by a lock.
1539   */
1540  NamespaceInfo unprotectedGetNamespaceInfo() {
1541    return new NamespaceInfo(getFSImage().getStorage().getNamespaceID(),
1542        getClusterId(), getBlockPoolId(),
1543        getFSImage().getStorage().getCTime());
1544  }
1545
1546  /**
1547   * Close down this file system manager.
1548   * Causes heartbeat and lease daemons to stop; waits briefly for
1549   * them to finish, but a short timeout returns control back to caller.
1550   */
1551  void close() {
1552    fsRunning = false;
1553    try {
1554      stopCommonServices();
1555      if (smmthread != null) smmthread.interrupt();
1556    } finally {
1557      // using finally to ensure we also wait for lease daemon
1558      try {
1559        stopActiveServices();
1560        stopStandbyServices();
1561      } catch (IOException ie) {
1562      } finally {
1563        IOUtils.cleanup(LOG, dir);
1564        IOUtils.cleanup(LOG, fsImage);
1565      }
1566    }
1567  }
1568
  /** @return whether this namesystem is running; set false by close(). */
  @Override
  public boolean isRunning() {
    return fsRunning;
  }
1573  
  /** @return true if this NN is (or will start) in the HA standby state. */
  @Override
  public boolean isInStandbyState() {
    if (haContext == null || haContext.getState() == null) {
      // We're still starting up. In this case, if HA is
      // on for the cluster, we always start in standby. Otherwise
      // start in active.
      return haEnabled;
    }

    return HAServiceState.STANDBY == haContext.getState().getServiceState();
  }
1585
1586  /**
1587   * Dump all metadata into specified file
1588   */
1589  void metaSave(String filename) throws IOException {
1590    checkSuperuserPrivilege();
1591    checkOperation(OperationCategory.UNCHECKED);
1592    writeLock();
1593    try {
1594      checkOperation(OperationCategory.UNCHECKED);
1595      File file = new File(System.getProperty("hadoop.log.dir"), filename);
1596      PrintWriter out = new PrintWriter(new BufferedWriter(
1597          new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8)));
1598      metaSave(out);
1599      out.flush();
1600      out.close();
1601    } finally {
1602      writeUnlock();
1603    }
1604  }
1605
  // Writes the inode/block summary line, then delegates the detailed dump
  // to the BlockManager. Caller must hold the write lock.
  private void metaSave(PrintWriter out) {
    assert hasWriteLock();
    long totalInodes = this.dir.totalInodes();
    long totalBlocks = this.getBlocksTotal();
    out.println(totalInodes + " files and directories, " + totalBlocks
        + " blocks = " + (totalInodes + totalBlocks) + " total");

    blockManager.metaSave(out);
  }
1615
  // Renders the metaSave dump into a String. Caller must hold the write
  // lock (asserted inside metaSave(PrintWriter)).
  private String metaSaveAsString() {
    StringWriter sw = new StringWriter();
    PrintWriter pw = new PrintWriter(sw);
    metaSave(pw);
    pw.flush();
    return sw.toString();
  }
1623
  /**
   * @return server defaults advertised to clients.
   * @throws StandbyException if reads are not allowed in the current HA state
   */
  FsServerDefaults getServerDefaults() throws StandbyException {
    checkOperation(OperationCategory.READ);
    return serverDefaults;
  }
1628
  /** @return configured access-time precision; 0 disables atime updates. */
  long getAccessTimePrecision() {
    return accessTimePrecision;
  }
1632
  // Access-time recording is enabled only when the precision is positive.
  private boolean isAccessTimeSupported() {
    return accessTimePrecision > 0;
  }
1636
1637  /////////////////////////////////////////////////////////
1638  //
1639  // These methods are called by HadoopFS clients
1640  //
1641  /////////////////////////////////////////////////////////
1642  /**
1643   * Set permissions for an existing file.
1644   * @throws IOException
1645   */
1646  void setPermission(String src, FsPermission permission) throws IOException {
1647    HdfsFileStatus auditStat;
1648    checkOperation(OperationCategory.WRITE);
1649    writeLock();
1650    try {
1651      checkOperation(OperationCategory.WRITE);
1652      checkNameNodeSafeMode("Cannot set permission for " + src);
1653      auditStat = FSDirAttrOp.setPermission(dir, src, permission);
1654    } catch (AccessControlException e) {
1655      logAuditEvent(false, "setPermission", src);
1656      throw e;
1657    } finally {
1658      writeUnlock();
1659    }
1660    getEditLog().logSync();
1661    logAuditEvent(true, "setPermission", src, null, auditStat);
1662  }
1663
1664  /**
1665   * Set owner for an existing file.
1666   * @throws IOException
1667   */
1668  void setOwner(String src, String username, String group)
1669      throws IOException {
1670    HdfsFileStatus auditStat;
1671    checkOperation(OperationCategory.WRITE);
1672    writeLock();
1673    try {
1674      checkOperation(OperationCategory.WRITE);
1675      checkNameNodeSafeMode("Cannot set owner for " + src);
1676      auditStat = FSDirAttrOp.setOwner(dir, src, username, group);
1677    } catch (AccessControlException e) {
1678      logAuditEvent(false, "setOwner", src);
1679      throw e;
1680    } finally {
1681      writeUnlock();
1682    }
1683    getEditLog().logSync();
1684    logAuditEvent(true, "setOwner", src, null, auditStat);
1685  }
1686
  /**
   * Result of a block-locations lookup: the located blocks plus a flag
   * telling the caller whether the file's access time needs updating.
   */
  static class GetBlockLocationsResult {
    // True when the atime precision interval has elapsed for this file.
    final boolean updateAccessTime;
    final LocatedBlocks blocks;
    boolean updateAccessTime() {
      return updateAccessTime;
    }
    private GetBlockLocationsResult(
        boolean updateAccessTime, LocatedBlocks blocks) {
      this.updateAccessTime = updateAccessTime;
      this.blocks = blocks;
    }
  }
1699
1700  /**
1701   * Get block locations within the specified range.
1702   * @see ClientProtocol#getBlockLocations(String, long, long)
1703   */
1704  LocatedBlocks getBlockLocations(String clientMachine, String srcArg,
1705      long offset, long length) throws IOException {
1706    checkOperation(OperationCategory.READ);
1707    GetBlockLocationsResult res = null;
1708    FSPermissionChecker pc = getPermissionChecker();
1709    readLock();
1710    try {
1711      checkOperation(OperationCategory.READ);
1712      res = getBlockLocations(pc, srcArg, offset, length, true, true);
1713    } catch (AccessControlException e) {
1714      logAuditEvent(false, "open", srcArg);
1715      throw e;
1716    } finally {
1717      readUnlock();
1718    }
1719
1720    logAuditEvent(true, "open", srcArg);
1721
1722    if (res.updateAccessTime()) {
1723      byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(
1724          srcArg);
1725      String src = srcArg;
1726      writeLock();
1727      final long now = now();
1728      try {
1729        checkOperation(OperationCategory.WRITE);
1730        /**
1731         * Resolve the path again and update the atime only when the file
1732         * exists.
1733         *
1734         * XXX: Races can still occur even after resolving the path again.
1735         * For example:
1736         *
1737         * <ul>
1738         *   <li>Get the block location for "/a/b"</li>
1739         *   <li>Rename "/a/b" to "/c/b"</li>
1740         *   <li>The second resolution still points to "/a/b", which is
1741         *   wrong.</li>
1742         * </ul>
1743         *
1744         * The behavior is incorrect but consistent with the one before
1745         * HDFS-7463. A better fix is to change the edit log of SetTime to
1746         * use inode id instead of a path.
1747         */
1748        src = dir.resolvePath(pc, srcArg, pathComponents);
1749        final INodesInPath iip = dir.getINodesInPath(src, true);
1750        INode inode = iip.getLastINode();
1751        boolean updateAccessTime = inode != null &&
1752            now > inode.getAccessTime() + getAccessTimePrecision();
1753        if (!isInSafeMode() && updateAccessTime) {
1754          boolean changed = FSDirAttrOp.setTimes(dir,
1755              inode, -1, now, false, iip.getLatestSnapshotId());
1756          if (changed) {
1757            getEditLog().logTimes(src, -1, now);
1758          }
1759        }
1760      } catch (Throwable e) {
1761        LOG.warn("Failed to update the access time of " + src, e);
1762      } finally {
1763        writeUnlock();
1764      }
1765    }
1766
1767    LocatedBlocks blocks = res.blocks;
1768    if (blocks != null) {
1769      blockManager.getDatanodeManager().sortLocatedBlocks(
1770          clientMachine, blocks.getLocatedBlocks());
1771
1772      // lastBlock is not part of getLocatedBlocks(), might need to sort it too
1773      LocatedBlock lastBlock = blocks.getLastLocatedBlock();
1774      if (lastBlock != null) {
1775        ArrayList<LocatedBlock> lastBlockList = Lists.newArrayList(lastBlock);
1776        blockManager.getDatanodeManager().sortLocatedBlocks(
1777            clientMachine, lastBlockList);
1778      }
1779    }
1780    return blocks;
1781  }
1782
1783  /**
1784   * Get block locations within the specified range.
1785   * @see ClientProtocol#getBlockLocations(String, long, long)
1786   * @throws IOException
1787   */
1788  GetBlockLocationsResult getBlockLocations(
1789      FSPermissionChecker pc, String src, long offset, long length,
1790      boolean needBlockToken, boolean checkSafeMode) throws IOException {
1791    if (offset < 0) {
1792      throw new HadoopIllegalArgumentException(
1793          "Negative offset is not supported. File: " + src);
1794    }
1795    if (length < 0) {
1796      throw new HadoopIllegalArgumentException(
1797          "Negative length is not supported. File: " + src);
1798    }
1799    final GetBlockLocationsResult ret = getBlockLocationsInt(
1800        pc, src, offset, length, needBlockToken);
1801
1802    if (checkSafeMode && isInSafeMode()) {
1803      for (LocatedBlock b : ret.blocks.getLocatedBlocks()) {
1804        // if safemode & no block locations yet then throw safemodeException
1805        if ((b.getLocations() == null) || (b.getLocations().length == 0)) {
1806          SafeModeException se = new SafeModeException(
1807              "Zero blocklocations for " + src, safeMode);
1808          if (haEnabled && haContext != null &&
1809              haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
1810            throw new RetriableException(se);
1811          } else {
1812            throw se;
1813          }
1814        }
1815      }
1816    }
1817    return ret;
1818  }
1819
  // Core lookup: resolves the path, checks permissions, builds the located
  // blocks, and decides whether the access time needs updating.
  // Caller must hold at least the read lock.
  private GetBlockLocationsResult getBlockLocationsInt(
      FSPermissionChecker pc, final String srcArg, long offset, long length,
      boolean needBlockToken)
      throws IOException {
    String src = srcArg;
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    src = dir.resolvePath(pc, srcArg, pathComponents);
    final INodesInPath iip = dir.getINodesInPath(src, true);
    final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
    if (isPermissionEnabled) {
      dir.checkPathAccess(pc, iip, FsAction.READ);
      checkUnreadableBySuperuser(pc, inode, iip.getPathSnapshotId());
    }

    // Snapshot reads use the size as of the snapshot; otherwise the last
    // under-construction block is excluded from the reported size.
    final long fileSize = iip.isSnapshot()
        ? inode.computeFileSize(iip.getPathSnapshotId())
        : inode.computeFileSizeNotIncludingLastUcBlock();
    boolean isUc = inode.isUnderConstruction();
    if (iip.isSnapshot()) {
      // if src indicates a snapshot file, we need to make sure the returned
      // blocks do not exceed the size of the snapshot file.
      length = Math.min(length, fileSize - offset);
      isUc = false;
    }

    // ".reserved/raw" access gets no encryption info attached.
    final FileEncryptionInfo feInfo =
        FSDirectory.isReservedRawName(srcArg) ? null
            : dir.getFileEncryptionInfo(inode, iip.getPathSnapshotId(), iip);

    final LocatedBlocks blocks = blockManager.createLocatedBlocks(
        inode.getBlocks(iip.getPathSnapshotId()), fileSize,
        isUc, offset, length, needBlockToken, iip.isSnapshot(), feInfo);

    // Set caching information for the located blocks.
    for (LocatedBlock lb : blocks.getLocatedBlocks()) {
      cacheManager.setCachedLocations(lb);
    }

    // Flag an atime update only when enabled, not in safe mode, not a
    // snapshot read, and the precision interval has elapsed.
    final long now = now();
    boolean updateAccessTime = isAccessTimeSupported() && !isInSafeMode()
        && !iip.isSnapshot()
        && now > inode.getAccessTime() + getAccessTimePrecision();
    return new GetBlockLocationsResult(updateAccessTime, blocks);
  }
1864
1865  /**
1866   * Moves all the blocks from {@code srcs} and appends them to {@code target}
1867   * To avoid rollbacks we will verify validity of ALL of the args
1868   * before we start actual move.
1869   * 
1870   * This does not support ".inodes" relative path
1871   * @param target target to concat into
1872   * @param srcs file that will be concatenated
1873   * @throws IOException on error
1874   */
1875  void concat(String target, String [] srcs, boolean logRetryCache)
1876      throws IOException {
1877    checkOperation(OperationCategory.WRITE);
1878    waitForLoadingFSImage();
1879    HdfsFileStatus stat = null;
1880    boolean success = false;
1881    writeLock();
1882    try {
1883      checkOperation(OperationCategory.WRITE);
1884      checkNameNodeSafeMode("Cannot concat " + target);
1885      stat = FSDirConcatOp.concat(dir, target, srcs, logRetryCache);
1886      success = true;
1887    } finally {
1888      writeUnlock();
1889      if (success) {
1890        getEditLog().logSync();
1891      }
1892      logAuditEvent(success, "concat", Arrays.toString(srcs), target, stat);
1893    }
1894  }
1895
1896  /**
1897   * stores the modification and access time for this inode. 
1898   * The access time is precise up to an hour. The transaction, if needed, is
1899   * written to the edits log but is not flushed.
1900   */
1901  void setTimes(String src, long mtime, long atime) throws IOException {
1902    HdfsFileStatus auditStat;
1903    checkOperation(OperationCategory.WRITE);
1904    writeLock();
1905    try {
1906      checkOperation(OperationCategory.WRITE);
1907      checkNameNodeSafeMode("Cannot set times " + src);
1908      auditStat = FSDirAttrOp.setTimes(dir, src, mtime, atime);
1909    } catch (AccessControlException e) {
1910      logAuditEvent(false, "setTimes", src);
1911      throw e;
1912    } finally {
1913      writeUnlock();
1914    }
1915    getEditLog().logSync();
1916    logAuditEvent(true, "setTimes", src, null, auditStat);
1917  }
1918
1919  /**
1920   * Create a symbolic link.
1921   */
1922  @SuppressWarnings("deprecation")
1923  void createSymlink(String target, String link,
1924      PermissionStatus dirPerms, boolean createParent, boolean logRetryCache)
1925      throws IOException {
1926    if (!FileSystem.areSymlinksEnabled()) {
1927      throw new UnsupportedOperationException("Symlinks not supported");
1928    }
1929    HdfsFileStatus auditStat = null;
1930    checkOperation(OperationCategory.WRITE);
1931    writeLock();
1932    try {
1933      checkOperation(OperationCategory.WRITE);
1934      checkNameNodeSafeMode("Cannot create symlink " + link);
1935      auditStat = FSDirSymlinkOp.createSymlinkInt(this, target, link, dirPerms,
1936                                                  createParent, logRetryCache);
1937    } catch (AccessControlException e) {
1938      logAuditEvent(false, "createSymlink", link, target, null);
1939      throw e;
1940    } finally {
1941      writeUnlock();
1942    }
1943    getEditLog().logSync();
1944    logAuditEvent(true, "createSymlink", link, target, auditStat);
1945  }
1946
1947  /**
1948   * Set replication for an existing file.
1949   * 
1950   * The NameNode sets new replication and schedules either replication of 
1951   * under-replicated data blocks or removal of the excessive block copies 
1952   * if the blocks are over-replicated.
1953   * 
1954   * @see ClientProtocol#setReplication(String, short)
1955   * @param src file name
1956   * @param replication new replication
1957   * @return true if successful; 
1958   *         false if file does not exist or is a directory
1959   */
1960  boolean setReplication(final String src, final short replication)
1961      throws IOException {
1962    boolean success = false;
1963    waitForLoadingFSImage();
1964    checkOperation(OperationCategory.WRITE);
1965    writeLock();
1966    try {
1967      checkOperation(OperationCategory.WRITE);
1968      checkNameNodeSafeMode("Cannot set replication for " + src);
1969      success = FSDirAttrOp.setReplication(dir, blockManager, src, replication);
1970    } catch (AccessControlException e) {
1971      logAuditEvent(false, "setReplication", src);
1972      throw e;
1973    } finally {
1974      writeUnlock();
1975    }
1976    if (success) {
1977      getEditLog().logSync();
1978      logAuditEvent(true, "setReplication", src);
1979    }
1980    return success;
1981  }
1982
1983  /**
1984   * Truncate file to a lower length.
1985   * Truncate cannot be reverted / recovered from as it causes data loss.
1986   * Truncation at block boundary is atomic, otherwise it requires
1987   * block recovery to truncate the last block of the file.
1988   *
1989   * @return true if client does not need to wait for block recovery,
1990   * false if client needs to wait for block recovery.
1991   */
1992  boolean truncate(String src, long newLength,
1993                   String clientName, String clientMachine,
1994                   long mtime)
1995      throws IOException, UnresolvedLinkException {
1996    boolean ret;
1997    try {
1998      ret = truncateInt(src, newLength, clientName, clientMachine, mtime);
1999    } catch (AccessControlException e) {
2000      logAuditEvent(false, "truncate", src);
2001      throw e;
2002    }
2003    return ret;
2004  }
2005
  // Lock-holding half of truncate(): validates the length, resolves the
  // path, performs the truncation, then syncs edits and frees removed
  // blocks outside the write lock.
  boolean truncateInt(String srcArg, long newLength,
                      String clientName, String clientMachine,
                      long mtime)
      throws IOException, UnresolvedLinkException {
    String src = srcArg;
    NameNode.stateChangeLog.debug(
        "DIR* NameSystem.truncate: src={} newLength={}", src, newLength);
    if (newLength < 0) {
      throw new HadoopIllegalArgumentException(
          "Cannot truncate to a negative file size: " + newLength + ".");
    }
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    boolean res;
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    BlocksMapUpdateInfo toRemoveBlocks = new BlocksMapUpdateInfo();
    try {
      // Re-check the operation category after acquiring the lock.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot truncate for " + src);
      src = dir.resolvePath(pc, src, pathComponents);
      res = truncateInternal(src, newLength, clientName,
          clientMachine, mtime, pc, toRemoveBlocks);
      stat = dir.getAuditFileInfo(dir.getINodesInPath4Write(src, false));
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    // Release the truncated-away blocks after dropping the write lock.
    if (!toRemoveBlocks.getToDeleteList().isEmpty()) {
      removeBlocks(toRemoveBlocks);
      toRemoveBlocks.clear();
    }
    logAuditEvent(true, "truncate", src, null, stat);
    return res;
  }
2042
2043  /**
2044   * Truncate a file to a given size
2045   * Update the count at each ancestor directory with quota
2046   */
2047  boolean truncateInternal(String src, long newLength,
2048                           String clientName, String clientMachine,
2049                           long mtime, FSPermissionChecker pc,
2050                           BlocksMapUpdateInfo toRemoveBlocks)
2051      throws IOException, UnresolvedLinkException {
2052    assert hasWriteLock();
2053    INodesInPath iip = dir.getINodesInPath4Write(src, true);
2054    if (isPermissionEnabled) {
2055      dir.checkPathAccess(pc, iip, FsAction.WRITE);
2056    }
2057    INodeFile file = INodeFile.valueOf(iip.getLastINode(), src);
2058    final BlockStoragePolicy lpPolicy =
2059        blockManager.getStoragePolicy("LAZY_PERSIST");
2060
2061    if (lpPolicy != null &&
2062        lpPolicy.getId() == file.getStoragePolicyID()) {
2063      throw new UnsupportedOperationException(
2064          "Cannot truncate lazy persist file " + src);
2065    }
2066
2067    // Check if the file is already being truncated with the same length
2068    final BlockInfoContiguous last = file.getLastBlock();
2069    if (last != null && last.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
2070      final Block truncateBlock
2071          = ((BlockInfoContiguousUnderConstruction)last).getTruncateBlock();
2072      if (truncateBlock != null) {
2073        final long truncateLength = file.computeFileSize(false, false)
2074            + truncateBlock.getNumBytes();
2075        if (newLength == truncateLength) {
2076          return false;
2077        }
2078      }
2079    }
2080
2081    // Opening an existing file for truncate. May need lease recovery.
2082    recoverLeaseInternal(RecoverLeaseOp.TRUNCATE_FILE,
2083        iip, src, clientName, clientMachine, false);
2084    // Truncate length check.
2085    long oldLength = file.computeFileSize();
2086    if(oldLength == newLength) {
2087      return true;
2088    }
2089    if(oldLength < newLength) {
2090      throw new HadoopIllegalArgumentException(
2091          "Cannot truncate to a larger file size. Current size: " + oldLength +
2092              ", truncate size: " + newLength + ".");
2093    }
2094    // Perform INodeFile truncation.
2095    final QuotaCounts delta = new QuotaCounts.Builder().build();
2096    boolean onBlockBoundary = dir.truncate(iip, newLength, toRemoveBlocks,
2097        mtime, delta);
2098    Block truncateBlock = null;
2099    if(!onBlockBoundary) {
2100      // Open file for write, but don't log into edits
2101      long lastBlockDelta = file.computeFileSize() - newLength;
2102      assert lastBlockDelta > 0 : "delta is 0 only if on block bounday";
2103      truncateBlock = prepareFileForTruncate(iip, clientName, clientMachine,
2104          lastBlockDelta, null);
2105    }
2106
2107    // update the quota: use the preferred block size for UC block
2108    dir.writeLock();
2109    try {
2110      dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta);
2111    } finally {
2112      dir.writeUnlock();
2113    }
2114
2115    getEditLog().logTruncate(src, clientName, clientMachine, newLength, mtime,
2116        truncateBlock);
2117    return onBlockBoundary;
2118  }
2119
2120  /**
2121   * Convert current INode to UnderConstruction.
2122   * Recreate lease.
2123   * Create new block for the truncated copy.
2124   * Schedule truncation of the replicas.
2125   *
2126   * @return the returned block will be written to editLog and passed back into
2127   * this method upon loading.
2128   */
2129  Block prepareFileForTruncate(INodesInPath iip,
2130                               String leaseHolder,
2131                               String clientMachine,
2132                               long lastBlockDelta,
2133                               Block newBlock)
2134      throws IOException {
2135    INodeFile file = iip.getLastINode().asFile();
2136    String src = iip.getPath();
2137    file.recordModification(iip.getLatestSnapshotId());
2138    file.toUnderConstruction(leaseHolder, clientMachine);
2139    assert file.isUnderConstruction() : "inode should be under construction.";
2140    leaseManager.addLease(
2141        file.getFileUnderConstructionFeature().getClientName(), src);
2142    boolean shouldRecoverNow = (newBlock == null);
2143    BlockInfoContiguous oldBlock = file.getLastBlock();
2144    boolean shouldCopyOnTruncate = shouldCopyOnTruncate(file, oldBlock);
2145    if(newBlock == null) {
2146      newBlock = (shouldCopyOnTruncate) ? createNewBlock() :
2147          new Block(oldBlock.getBlockId(), oldBlock.getNumBytes(),
2148              nextGenerationStamp(blockIdManager.isLegacyBlock(oldBlock)));
2149    }
2150
2151    BlockInfoContiguousUnderConstruction truncatedBlockUC;
2152    if(shouldCopyOnTruncate) {
2153      // Add new truncateBlock into blocksMap and
2154      // use oldBlock as a source for copy-on-truncate recovery
2155      truncatedBlockUC = new BlockInfoContiguousUnderConstruction(newBlock,
2156          file.getBlockReplication());
2157      truncatedBlockUC.setNumBytes(oldBlock.getNumBytes() - lastBlockDelta);
2158      truncatedBlockUC.setTruncateBlock(oldBlock);
2159      file.setLastBlock(truncatedBlockUC, blockManager.getStorages(oldBlock));
2160      getBlockManager().addBlockCollection(truncatedBlockUC, file);
2161
2162      NameNode.stateChangeLog.debug(
2163          "BLOCK* prepareFileForTruncate: Scheduling copy-on-truncate to new" +
2164          " size {}  new block {} old block {}", truncatedBlockUC.getNumBytes(),
2165          newBlock, truncatedBlockUC.getTruncateBlock());
2166    } else {
2167      // Use new generation stamp for in-place truncate recovery
2168      blockManager.convertLastBlockToUnderConstruction(file, lastBlockDelta);
2169      oldBlock = file.getLastBlock();
2170      assert !oldBlock.isComplete() : "oldBlock should be under construction";
2171      truncatedBlockUC = (BlockInfoContiguousUnderConstruction) oldBlock;
2172      truncatedBlockUC.setTruncateBlock(new Block(oldBlock));
2173      truncatedBlockUC.getTruncateBlock().setNumBytes(
2174          oldBlock.getNumBytes() - lastBlockDelta);
2175      truncatedBlockUC.getTruncateBlock().setGenerationStamp(
2176          newBlock.getGenerationStamp());
2177
2178      NameNode.stateChangeLog.debug(
2179          "BLOCK* prepareFileForTruncate: {} Scheduling in-place block " +
2180          "truncate to new size {}",
2181          truncatedBlockUC.getTruncateBlock().getNumBytes(), truncatedBlockUC);
2182    }
2183    if (shouldRecoverNow) {
2184      truncatedBlockUC.initializeBlockRecovery(newBlock.getGenerationStamp());
2185    }
2186
2187    return newBlock;
2188  }
2189
2190  /**
2191   * Defines if a replica needs to be copied on truncate or
2192   * can be truncated in place.
2193   */
2194  boolean shouldCopyOnTruncate(INodeFile file, BlockInfoContiguous blk) {
2195    if(!isUpgradeFinalized()) {
2196      return true;
2197    }
2198    if (isRollingUpgrade()) {
2199      return true;
2200    }
2201    return file.isBlockInLatestSnapshot(blk);
2202  }
2203
2204  /**
2205   * Set the storage policy for a file or a directory.
2206   *
2207   * @param src file/directory path
2208   * @param policyName storage policy name
2209   */
2210  void setStoragePolicy(String src, String policyName) throws IOException {
2211    HdfsFileStatus auditStat;
2212    waitForLoadingFSImage();
2213    checkOperation(OperationCategory.WRITE);
2214    writeLock();
2215    try {
2216      checkOperation(OperationCategory.WRITE);
2217      checkNameNodeSafeMode("Cannot set storage policy for " + src);
2218      auditStat = FSDirAttrOp.setStoragePolicy(
2219          dir, blockManager, src, policyName);
2220    } catch (AccessControlException e) {
2221      logAuditEvent(false, "setStoragePolicy", src);
2222      throw e;
2223    } finally {
2224      writeUnlock();
2225    }
2226    getEditLog().logSync();
2227    logAuditEvent(true, "setStoragePolicy", src, null, auditStat);
2228  }
2229
2230  /**
2231   * @return All the existing block storage policies
2232   */
2233  BlockStoragePolicy[] getStoragePolicies() throws IOException {
2234    checkOperation(OperationCategory.READ);
2235    waitForLoadingFSImage();
2236    readLock();
2237    try {
2238      checkOperation(OperationCategory.READ);
2239      return FSDirAttrOp.getStoragePolicies(blockManager);
2240    } finally {
2241      readUnlock();
2242    }
2243  }
2244
  /**
   * @param src path of the file
   * @return the preferred block size for the file at {@code src}
   */
  long getPreferredBlockSize(String src) throws IOException {
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      // Re-check the operation category after acquiring the lock.
      checkOperation(OperationCategory.READ);
      return FSDirAttrOp.getPreferredBlockSize(dir, src);
    } finally {
      readUnlock();
    }
  }
2255
2256  /**
2257   * If the file is within an encryption zone, select the appropriate 
2258   * CryptoProtocolVersion from the list provided by the client. Since the
2259   * client may be newer, we need to handle unknown versions.
2260   *
2261   * @param zone EncryptionZone of the file
2262   * @param supportedVersions List of supported protocol versions
2263   * @return chosen protocol version
2264   * @throws IOException
2265   */
2266  private CryptoProtocolVersion chooseProtocolVersion(EncryptionZone zone,
2267      CryptoProtocolVersion[] supportedVersions)
2268      throws UnknownCryptoProtocolVersionException, UnresolvedLinkException,
2269        SnapshotAccessControlException {
2270    Preconditions.checkNotNull(zone);
2271    Preconditions.checkNotNull(supportedVersions);
2272    // Right now, we only support a single protocol version,
2273    // so simply look for it in the list of provided options
2274    final CryptoProtocolVersion required = zone.getVersion();
2275
2276    for (CryptoProtocolVersion c : supportedVersions) {
2277      if (c.equals(CryptoProtocolVersion.UNKNOWN)) {
2278        if (LOG.isDebugEnabled()) {
2279          LOG.debug("Ignoring unknown CryptoProtocolVersion provided by " +
2280              "client: " + c.getUnknownValue());
2281        }
2282        continue;
2283      }
2284      if (c.equals(required)) {
2285        return c;
2286      }
2287    }
2288    throw new UnknownCryptoProtocolVersionException(
2289        "No crypto protocol versions provided by the client are supported."
2290            + " Client provided: " + Arrays.toString(supportedVersions)
2291            + " NameNode supports: " + Arrays.toString(CryptoProtocolVersion
2292            .values()));
2293  }
2294
2295  /**
2296   * Invoke KeyProvider APIs to generate an encrypted data encryption key for an
2297   * encryption zone. Should not be called with any locks held.
2298   *
2299   * @param ezKeyName key name of an encryption zone
2300   * @return New EDEK, or null if ezKeyName is null
2301   * @throws IOException
2302   */
2303  private EncryptedKeyVersion generateEncryptedDataEncryptionKey(String
2304      ezKeyName) throws IOException {
2305    if (ezKeyName == null) {
2306      return null;
2307    }
2308    EncryptedKeyVersion edek = null;
2309    try {
2310      edek = provider.generateEncryptedKey(ezKeyName);
2311    } catch (GeneralSecurityException e) {
2312      throw new IOException(e);
2313    }
2314    Preconditions.checkNotNull(edek);
2315    return edek;
2316  }
2317
2318  /**
2319   * Create a new file entry in the namespace.
2320   * 
2321   * For description of parameters and exceptions thrown see
2322   * {@link ClientProtocol#create}, except it returns valid file status upon
2323   * success
2324   */
2325  HdfsFileStatus startFile(String src, PermissionStatus permissions,
2326      String holder, String clientMachine, EnumSet<CreateFlag> flag,
2327      boolean createParent, short replication, long blockSize, 
2328      CryptoProtocolVersion[] supportedVersions, boolean logRetryCache)
2329      throws AccessControlException, SafeModeException,
2330      FileAlreadyExistsException, UnresolvedLinkException,
2331      FileNotFoundException, ParentNotDirectoryException, IOException {
2332
2333    HdfsFileStatus status = null;
2334    try {
2335      status = startFileInt(src, permissions, holder, clientMachine, flag,
2336          createParent, replication, blockSize, supportedVersions,
2337          logRetryCache);
2338    } catch (AccessControlException e) {
2339      logAuditEvent(false, "create", src);
2340      throw e;
2341    }
2342    return status;
2343  }
2344
2345  private HdfsFileStatus startFileInt(final String srcArg,
2346      PermissionStatus permissions, String holder, String clientMachine,
2347      EnumSet<CreateFlag> flag, boolean createParent, short replication,
2348      long blockSize, CryptoProtocolVersion[] supportedVersions,
2349      boolean logRetryCache)
2350      throws AccessControlException, SafeModeException,
2351      FileAlreadyExistsException, UnresolvedLinkException,
2352      FileNotFoundException, ParentNotDirectoryException, IOException {
2353    String src = srcArg;
2354    if (NameNode.stateChangeLog.isDebugEnabled()) {
2355      StringBuilder builder = new StringBuilder();
2356      builder.append("DIR* NameSystem.startFile: src=" + src
2357              + ", holder=" + holder
2358              + ", clientMachine=" + clientMachine
2359              + ", createParent=" + createParent
2360              + ", replication=" + replication
2361              + ", createFlag=" + flag.toString()
2362              + ", blockSize=" + blockSize);
2363      builder.append(", supportedVersions=");
2364      if (supportedVersions != null) {
2365        builder.append(Arrays.toString(supportedVersions));
2366      } else {
2367        builder.append("null");
2368      }
2369      NameNode.stateChangeLog.debug(builder.toString());
2370    }
2371    if (!DFSUtil.isValidName(src)) {
2372      throw new InvalidPathException(src);
2373    }
2374    blockManager.verifyReplication(src, replication, clientMachine);
2375
2376    boolean skipSync = false;
2377    HdfsFileStatus stat = null;
2378    FSPermissionChecker pc = getPermissionChecker();
2379    checkOperation(OperationCategory.WRITE);
2380    if (blockSize < minBlockSize) {
2381      throw new IOException("Specified block size is less than configured" +
2382          " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY
2383          + "): " + blockSize + " < " + minBlockSize);
2384    }
2385    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2386    boolean create = flag.contains(CreateFlag.CREATE);
2387    boolean overwrite = flag.contains(CreateFlag.OVERWRITE);
2388    boolean isLazyPersist = flag.contains(CreateFlag.LAZY_PERSIST);
2389
2390    waitForLoadingFSImage();
2391
2392    /**
2393     * If the file is in an encryption zone, we optimistically create an
2394     * EDEK for the file by calling out to the configured KeyProvider.
2395     * Since this typically involves doing an RPC, we take the readLock
2396     * initially, then drop it to do the RPC.
2397     * 
2398     * Since the path can flip-flop between being in an encryption zone and not
2399     * in the meantime, we need to recheck the preconditions when we retake the
2400     * lock to do the create. If the preconditions are not met, we throw a
2401     * special RetryStartFileException to ask the DFSClient to try the create
2402     * again later.
2403     */
2404    CryptoProtocolVersion protocolVersion = null;
2405    CipherSuite suite = null;
2406    String ezKeyName = null;
2407    EncryptedKeyVersion edek = null;
2408
2409    if (provider != null) {
2410      readLock();
2411      try {
2412        src = dir.resolvePath(pc, src, pathComponents);
2413        INodesInPath iip = dir.getINodesInPath4Write(src);
2414        // Nothing to do if the path is not within an EZ
2415        final EncryptionZone zone = dir.getEZForPath(iip);
2416        if (zone != null) {
2417          protocolVersion = chooseProtocolVersion(zone, supportedVersions);
2418          suite = zone.getSuite();
2419          ezKeyName = zone.getKeyName();
2420
2421          Preconditions.checkNotNull(protocolVersion);
2422          Preconditions.checkNotNull(suite);
2423          Preconditions.checkArgument(!suite.equals(CipherSuite.UNKNOWN),
2424              "Chose an UNKNOWN CipherSuite!");
2425          Preconditions.checkNotNull(ezKeyName);
2426        }
2427      } finally {
2428        readUnlock();
2429      }
2430
2431      Preconditions.checkState(
2432          (suite == null && ezKeyName == null) ||
2433              (suite != null && ezKeyName != null),
2434          "Both suite and ezKeyName should both be null or not null");
2435
2436      // Generate EDEK if necessary while not holding the lock
2437      edek = generateEncryptedDataEncryptionKey(ezKeyName);
2438      EncryptionFaultInjector.getInstance().startFileAfterGenerateKey();
2439    }
2440
2441    // Proceed with the create, using the computed cipher suite and 
2442    // generated EDEK
2443    BlocksMapUpdateInfo toRemoveBlocks = null;
2444    writeLock();
2445    try {
2446      checkOperation(OperationCategory.WRITE);
2447      checkNameNodeSafeMode("Cannot create file" + src);
2448      dir.writeLock();
2449      try {
2450        src = dir.resolvePath(pc, src, pathComponents);
2451        final INodesInPath iip = dir.getINodesInPath4Write(src);
2452        toRemoveBlocks = startFileInternal(
2453            pc, iip, permissions, holder,
2454            clientMachine, create, overwrite,
2455            createParent, replication, blockSize,
2456            isLazyPersist, suite, protocolVersion, edek,
2457            logRetryCache);
2458        stat = FSDirStatAndListingOp.getFileInfo(
2459            dir, src, false, FSDirectory.isReservedRawName(srcArg), true);
2460      } finally {
2461        dir.writeUnlock();
2462      }
2463    } catch (StandbyException se) {
2464      skipSync = true;
2465      throw se;
2466    } finally {
2467      writeUnlock();
2468      // There might be transactions logged while trying to recover the lease.
2469      // They need to be sync'ed even when an exception was thrown.
2470      if (!skipSync) {
2471        getEditLog().logSync();
2472        if (toRemoveBlocks != null) {
2473          removeBlocks(toRemoveBlocks);
2474          toRemoveBlocks.clear();
2475        }
2476      }
2477    }
2478
2479    logAuditEvent(true, "create", srcArg, null, stat);
2480    return stat;
2481  }
2482
2483  /**
2484   * Create a new file or overwrite an existing file<br>
2485   * 
2486   * Once the file is create the client then allocates a new block with the next
2487   * call using {@link ClientProtocol#addBlock}.
2488   * <p>
2489   * For description of parameters and exceptions thrown see
2490   * {@link ClientProtocol#create}
2491   */
2492  private BlocksMapUpdateInfo startFileInternal(FSPermissionChecker pc, 
2493      INodesInPath iip, PermissionStatus permissions, String holder,
2494      String clientMachine, boolean create, boolean overwrite, 
2495      boolean createParent, short replication, long blockSize, 
2496      boolean isLazyPersist, CipherSuite suite, CryptoProtocolVersion version,
2497      EncryptedKeyVersion edek, boolean logRetryEntry)
2498      throws IOException {
2499    assert hasWriteLock();
2500    // Verify that the destination does not exist as a directory already.
2501    final INode inode = iip.getLastINode();
2502    final String src = iip.getPath();
2503    if (inode != null && inode.isDirectory()) {
2504      throw new FileAlreadyExistsException(src +
2505          " already exists as a directory");
2506    }
2507
2508    final INodeFile myFile = INodeFile.valueOf(inode, src, true);
2509    if (isPermissionEnabled) {
2510      if (overwrite && myFile != null) {
2511        dir.checkPathAccess(pc, iip, FsAction.WRITE);
2512      }
2513      /*
2514       * To overwrite existing file, need to check 'w' permission 
2515       * of parent (equals to ancestor in this case)
2516       */
2517      dir.checkAncestorAccess(pc, iip, FsAction.WRITE);
2518    }
2519    if (!createParent) {
2520      dir.verifyParentDir(iip, src);
2521    }
2522
2523    FileEncryptionInfo feInfo = null;
2524
2525    final EncryptionZone zone = dir.getEZForPath(iip);
2526    if (zone != null) {
2527      // The path is now within an EZ, but we're missing encryption parameters
2528      if (suite == null || edek == null) {
2529        throw new RetryStartFileException();
2530      }
2531      // Path is within an EZ and we have provided encryption parameters.
2532      // Make sure that the generated EDEK matches the settings of the EZ.
2533      final String ezKeyName = zone.getKeyName();
2534      if (!ezKeyName.equals(edek.getEncryptionKeyName())) {
2535        throw new RetryStartFileException();
2536      }
2537      feInfo = new FileEncryptionInfo(suite, version,
2538          edek.getEncryptedKeyVersion().getMaterial(),
2539          edek.getEncryptedKeyIv(),
2540          ezKeyName, edek.getEncryptionKeyVersionName());
2541    }
2542
2543    try {
2544      BlocksMapUpdateInfo toRemoveBlocks = null;
2545      if (myFile == null) {
2546        if (!create) {
2547          throw new FileNotFoundException("Can't overwrite non-existent " +
2548              src + " for client " + clientMachine);
2549        }
2550      } else {
2551        if (overwrite) {
2552          toRemoveBlocks = new BlocksMapUpdateInfo();
2553          List<INode> toRemoveINodes = new ChunkedArrayList<INode>();
2554          long ret = FSDirDeleteOp.delete(dir, iip, toRemoveBlocks,
2555                                          toRemoveINodes, now());
2556          if (ret >= 0) {
2557            iip = INodesInPath.replace(iip, iip.length() - 1, null);
2558            FSDirDeleteOp.incrDeletedFileCount(ret);
2559            removeLeasesAndINodes(src, toRemoveINodes, true);
2560          }
2561        } else {
2562          // If lease soft limit time is expired, recover the lease
2563          recoverLeaseInternal(RecoverLeaseOp.CREATE_FILE,
2564              iip, src, holder, clientMachine, false);
2565          throw new FileAlreadyExistsException(src + " for client " +
2566              clientMachine + " already exists");
2567        }
2568      }
2569
2570      checkFsObjectLimit();
2571      INodeFile newNode = null;
2572
2573      // Always do an implicit mkdirs for parent directory tree.
2574      Map.Entry<INodesInPath, String> parent = FSDirMkdirOp
2575          .createAncestorDirectories(dir, iip, permissions);
2576      if (parent != null) {
2577        iip = dir.addFile(parent.getKey(), parent.getValue(), permissions,
2578            replication, blockSize, holder, clientMachine);
2579        newNode = iip != null ? iip.getLastINode().asFile() : null;
2580      }
2581
2582      if (newNode == null) {
2583        throw new IOException("Unable to add " + src +  " to namespace");
2584      }
2585      leaseManager.addLease(newNode.getFileUnderConstructionFeature()
2586          .getClientName(), src);
2587
2588      // Set encryption attributes if necessary
2589      if (feInfo != null) {
2590        dir.setFileEncryptionInfo(src, feInfo);
2591        newNode = dir.getInode(newNode.getId()).asFile();
2592      }
2593
2594      setNewINodeStoragePolicy(newNode, iip, isLazyPersist);
2595
2596      // record file record in log, record new generation stamp
2597      getEditLog().logOpenFile(src, newNode, overwrite, logRetryEntry);
2598      NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: added {}" +
2599          " inode {} holder {}", src, newNode.getId(), holder);
2600      return toRemoveBlocks;
2601    } catch (IOException ie) {
2602      NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: " + src + " " +
2603          ie.getMessage());
2604      throw ie;
2605    }
2606  }
2607
2608  private void setNewINodeStoragePolicy(INodeFile inode,
2609                                        INodesInPath iip,
2610                                        boolean isLazyPersist)
2611      throws IOException {
2612
2613    if (isLazyPersist) {
2614      BlockStoragePolicy lpPolicy =
2615          blockManager.getStoragePolicy("LAZY_PERSIST");
2616
2617      // Set LAZY_PERSIST storage policy if the flag was passed to
2618      // CreateFile.
2619      if (lpPolicy == null) {
2620        throw new HadoopIllegalArgumentException(
2621            "The LAZY_PERSIST storage policy has been disabled " +
2622            "by the administrator.");
2623      }
2624      inode.setStoragePolicyID(lpPolicy.getId(),
2625                                 iip.getLatestSnapshotId());
2626    } else {
2627      BlockStoragePolicy effectivePolicy =
2628          blockManager.getStoragePolicy(inode.getStoragePolicyID());
2629
2630      if (effectivePolicy != null &&
2631          effectivePolicy.isCopyOnCreateFile()) {
2632        // Copy effective policy from ancestor directory to current file.
2633        inode.setStoragePolicyID(effectivePolicy.getId(),
2634                                 iip.getLatestSnapshotId());
2635      }
2636    }
2637  }
2638
2639  /**
2640   * Append to an existing file for append.
2641   * <p>
2642   * 
2643   * The method returns the last block of the file if this is a partial block,
2644   * which can still be used for writing more data. The client uses the returned
2645   * block locations to form the data pipeline for this block.<br>
2646   * The method returns null if the last block is full. The client then
2647   * allocates a new block with the next call using
2648   * {@link ClientProtocol#addBlock}.
2649   * <p>
2650   * 
2651   * For description of parameters and exceptions thrown see
2652   * {@link ClientProtocol#append(String, String, EnumSetWritable)}
2653   *
2654   * @return the last block locations if the block is partial or null otherwise
2655   */
2656  private LocatedBlock appendFileInternal(FSPermissionChecker pc,
2657      INodesInPath iip, String holder, String clientMachine, boolean newBlock,
2658      boolean logRetryCache) throws IOException {
2659    assert hasWriteLock();
2660    // Verify that the destination does not exist as a directory already.
2661    final INode inode = iip.getLastINode();
2662    final String src = iip.getPath();
2663    if (inode != null && inode.isDirectory()) {
2664      throw new FileAlreadyExistsException("Cannot append to directory " + src
2665          + "; already exists as a directory.");
2666    }
2667    if (isPermissionEnabled) {
2668      dir.checkPathAccess(pc, iip, FsAction.WRITE);
2669    }
2670
2671    try {
2672      if (inode == null) {
2673        throw new FileNotFoundException("failed to append to non-existent file "
2674          + src + " for client " + clientMachine);
2675      }
2676      INodeFile myFile = INodeFile.valueOf(inode, src, true);
2677      final BlockStoragePolicy lpPolicy =
2678          blockManager.getStoragePolicy("LAZY_PERSIST");
2679      if (lpPolicy != null &&
2680          lpPolicy.getId() == myFile.getStoragePolicyID()) {
2681        throw new UnsupportedOperationException(
2682            "Cannot append to lazy persist file " + src);
2683      }
2684      // Opening an existing file for append - may need to recover lease.
2685      recoverLeaseInternal(RecoverLeaseOp.APPEND_FILE,
2686          iip, src, holder, clientMachine, false);
2687      
2688      final BlockInfoContiguous lastBlock = myFile.getLastBlock();
2689      // Check that the block has at least minimum replication.
2690      if(lastBlock != null && lastBlock.isComplete() &&
2691          !getBlockManager().isSufficientlyReplicated(lastBlock)) {
2692        throw new IOException("append: lastBlock=" + lastBlock +
2693            " of src=" + src + " is not sufficiently replicated yet.");
2694      }
2695      return prepareFileForAppend(src, iip, holder, clientMachine, newBlock,
2696          true, logRetryCache);
2697    } catch (IOException ie) {
2698      NameNode.stateChangeLog.warn("DIR* NameSystem.append: " +ie.getMessage());
2699      throw ie;
2700    }
2701  }
2702  
2703  /**
2704   * Convert current node to under construction.
2705   * Recreate in-memory lease record.
2706   * 
2707   * @param src path to the file
2708   * @param leaseHolder identifier of the lease holder on this file
2709   * @param clientMachine identifier of the client machine
2710   * @param newBlock if the data is appended to a new block
2711   * @param writeToEditLog whether to persist this change to the edit log
2712   * @param logRetryCache whether to record RPC ids in editlog for retry cache
2713   *                      rebuilding
2714   * @return the last block locations if the block is partial or null otherwise
2715   * @throws UnresolvedLinkException
2716   * @throws IOException
2717   */
2718  LocatedBlock prepareFileForAppend(String src, INodesInPath iip,
2719      String leaseHolder, String clientMachine, boolean newBlock,
2720      boolean writeToEditLog, boolean logRetryCache) throws IOException {
2721    final INodeFile file = iip.getLastINode().asFile();
2722    final QuotaCounts delta = verifyQuotaForUCBlock(file, iip);
2723
2724    file.recordModification(iip.getLatestSnapshotId());
2725    file.toUnderConstruction(leaseHolder, clientMachine);
2726
2727    leaseManager.addLease(
2728        file.getFileUnderConstructionFeature().getClientName(), src);
2729
2730    LocatedBlock ret = null;
2731    if (!newBlock) {
2732      ret = blockManager.convertLastBlockToUnderConstruction(file, 0);
2733      if (ret != null && delta != null) {
2734        Preconditions.checkState(delta.getStorageSpace() >= 0,
2735            "appending to a block with size larger than the preferred block size");
2736        dir.writeLock();
2737        try {
2738          dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta);
2739        } finally {
2740          dir.writeUnlock();
2741        }
2742      }
2743    } else {
2744      BlockInfoContiguous lastBlock = file.getLastBlock();
2745      if (lastBlock != null) {
2746        ExtendedBlock blk = new ExtendedBlock(this.getBlockPoolId(), lastBlock);
2747        ret = new LocatedBlock(blk, new DatanodeInfo[0]);
2748      }
2749    }
2750
2751    if (writeToEditLog) {
2752      getEditLog().logAppendFile(src, file, newBlock, logRetryCache);
2753    }
2754    return ret;
2755  }
2756
2757  /**
2758   * Verify quota when using the preferred block size for UC block. This is
2759   * usually used by append and truncate
2760   * @throws QuotaExceededException when violating the storage quota
2761   * @return expected quota usage update. null means no change or no need to
2762   *         update quota usage later
2763   */
2764  private QuotaCounts verifyQuotaForUCBlock(INodeFile file, INodesInPath iip)
2765      throws QuotaExceededException {
2766    if (!isImageLoaded() || dir.shouldSkipQuotaChecks()) {
2767      // Do not check quota if editlog is still being processed
2768      return null;
2769    }
2770    if (file.getLastBlock() != null) {
2771      final QuotaCounts delta = computeQuotaDeltaForUCBlock(file);
2772      dir.readLock();
2773      try {
2774        FSDirectory.verifyQuota(iip, iip.length() - 1, delta, null);
2775        return delta;
2776      } finally {
2777        dir.readUnlock();
2778      }
2779    }
2780    return null;
2781  }
2782
2783  /** Compute quota change for converting a complete block to a UC block */
2784  private QuotaCounts computeQuotaDeltaForUCBlock(INodeFile file) {
2785    final QuotaCounts delta = new QuotaCounts.Builder().build();
2786    final BlockInfoContiguous lastBlock = file.getLastBlock();
2787    if (lastBlock != null) {
2788      final long diff = file.getPreferredBlockSize() - lastBlock.getNumBytes();
2789      final short repl = file.getBlockReplication();
2790      delta.addStorageSpace(diff * repl);
2791      final BlockStoragePolicy policy = dir.getBlockStoragePolicySuite()
2792          .getPolicy(file.getStoragePolicyID());
2793      List<StorageType> types = policy.chooseStorageTypes(repl);
2794      for (StorageType t : types) {
2795        if (t.supportTypeQuota()) {
2796          delta.addTypeSpace(t, diff);
2797        }
2798      }
2799    }
2800    return delta;
2801  }
2802
2803  /**
2804   * Recover lease;
2805   * Immediately revoke the lease of the current lease holder and start lease
2806   * recovery so that the file can be forced to be closed.
2807   * 
2808   * @param src the path of the file to start lease recovery
2809   * @param holder the lease holder's name
2810   * @param clientMachine the client machine's name
2811   * @return true if the file is already closed or
2812   *         if the lease can be released and the file can be closed.
2813   * @throws IOException
2814   */
2815  boolean recoverLease(String src, String holder, String clientMachine)
2816      throws IOException {
2817    if (!DFSUtil.isValidName(src)) {
2818      throw new IOException("Invalid file name: " + src);
2819    }
2820  
2821    boolean skipSync = false;
2822    FSPermissionChecker pc = getPermissionChecker();
2823    checkOperation(OperationCategory.WRITE);
2824    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2825    writeLock();
2826    try {
2827      checkOperation(OperationCategory.WRITE);
2828      checkNameNodeSafeMode("Cannot recover the lease of " + src);
2829      src = dir.resolvePath(pc, src, pathComponents);
2830      final INodesInPath iip = dir.getINodesInPath4Write(src);
2831      final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
2832      if (!inode.isUnderConstruction()) {
2833        return true;
2834      }
2835      if (isPermissionEnabled) {
2836        dir.checkPathAccess(pc, iip, FsAction.WRITE);
2837      }
2838  
2839      return recoverLeaseInternal(RecoverLeaseOp.RECOVER_LEASE,
2840          iip, src, holder, clientMachine, true);
2841    } catch (StandbyException se) {
2842      skipSync = true;
2843      throw se;
2844    } finally {
2845      writeUnlock();
2846      // There might be transactions logged while trying to recover the lease.
2847      // They need to be sync'ed even when an exception was thrown.
2848      if (!skipSync) {
2849        getEditLog().logSync();
2850      }
2851    }
2852  }
2853
2854  private enum RecoverLeaseOp {
2855    CREATE_FILE,
2856    APPEND_FILE,
2857    TRUNCATE_FILE,
2858    RECOVER_LEASE;
2859    
2860    private String getExceptionMessage(String src, String holder,
2861        String clientMachine, String reason) {
2862      return "Failed to " + this + " " + src + " for " + holder +
2863          " on " + clientMachine + " because " + reason;
2864    }
2865  }
2866
  /**
   * Core lease check shared by create, append, truncate and recoverLease.
   * Must be called with the FSNamesystem write lock held.
   *
   * @param op operation on whose behalf recovery is attempted (only used in
   *           exception messages)
   * @param iip resolved path of the file
   * @param src path of the file
   * @param holder client requesting the operation
   * @param clientMachine machine of the requesting client
   * @param force if true, revoke the current lease immediately and try to
   *              close the file; if false, only start recovery when the
   *              current holder's soft limit has expired
   * @return true if the file is closed (or was closed by this call); false if
   *         block recovery has been started and the caller should retry later
   * @throws IOException if the lease is validly held by another client or a
   *         recovery is already in progress
   */
  boolean recoverLeaseInternal(RecoverLeaseOp op, INodesInPath iip,
      String src, String holder, String clientMachine, boolean force)
      throws IOException {
    assert hasWriteLock();
    INodeFile file = iip.getLastINode().asFile();
    if (file.isUnderConstruction()) {
      //
      // If the file is under construction , then it must be in our
      // leases. Find the appropriate lease record.
      //
      Lease lease = leaseManager.getLease(holder);

      if (!force && lease != null) {
        Lease leaseFile = leaseManager.getLeaseByPath(src);
        if (leaseFile != null && leaseFile.equals(lease)) {
          // We found the lease for this file but the original
          // holder is trying to obtain it again.
          throw new AlreadyBeingCreatedException(
              op.getExceptionMessage(src, holder, clientMachine,
                  holder + " is already the current lease holder."));
        }
      }
      //
      // Find the original holder.
      //
      FileUnderConstructionFeature uc = file.getFileUnderConstructionFeature();
      String clientName = uc.getClientName();
      lease = leaseManager.getLease(clientName);
      if (lease == null) {
        // Inconsistent state: under construction but no lease recorded.
        throw new AlreadyBeingCreatedException(
            op.getExceptionMessage(src, holder, clientMachine,
                "the file is under construction but no leases found."));
      }
      if (force) {
        // close now: no need to wait for soft lease expiration and 
        // close only the file src
        LOG.info("recoverLease: " + lease + ", src=" + src +
          " from client " + clientName);
        return internalReleaseLease(lease, src, iip, holder);
      } else {
        assert lease.getHolder().equals(clientName) :
          "Current lease holder " + lease.getHolder() +
          " does not match file creator " + clientName;
        //
        // If the original holder has not renewed in the last SOFTLIMIT 
        // period, then start lease recovery.
        //
        if (lease.expiredSoftLimit()) {
          LOG.info("startFile: recover " + lease + ", src=" + src + " client "
              + clientName);
          if (internalReleaseLease(lease, src, iip, null)) {
            return true;
          } else {
            throw new RecoveryInProgressException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "lease recovery is in progress. Try again later."));
          }
        } else {
          // Soft limit not expired: the lease is still validly held.
          final BlockInfoContiguous lastBlock = file.getLastBlock();
          if (lastBlock != null
              && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
            throw new RecoveryInProgressException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "another recovery is in progress by "
                        + clientName + " on " + uc.getClientMachine()));
          } else {
            throw new AlreadyBeingCreatedException(
                op.getExceptionMessage(src, holder, clientMachine,
                    "this file lease is currently owned by "
                        + clientName + " on " + uc.getClientMachine()));
          }
        }
      }
    } else {
      // Not under construction: the file is already closed.
      return true;
     }
  }
2944
2945  /**
2946   * Append to an existing file in the namespace.
2947   */
2948  LastBlockWithStatus appendFile(String src, String holder,
2949      String clientMachine, EnumSet<CreateFlag> flag, boolean logRetryCache)
2950      throws IOException {
2951    try {
2952      return appendFileInt(src, holder, clientMachine,
2953          flag.contains(CreateFlag.NEW_BLOCK), logRetryCache);
2954    } catch (AccessControlException e) {
2955      logAuditEvent(false, "append", src);
2956      throw e;
2957    }
2958  }
2959
2960  private LastBlockWithStatus appendFileInt(final String srcArg, String holder,
2961      String clientMachine, boolean newBlock, boolean logRetryCache)
2962      throws IOException {
2963    String src = srcArg;
2964    NameNode.stateChangeLog.debug(
2965        "DIR* NameSystem.appendFile: src={}, holder={}, clientMachine={}",
2966        src, holder, clientMachine);
2967    boolean skipSync = false;
2968    if (!supportAppends) {
2969      throw new UnsupportedOperationException(
2970          "Append is not enabled on this NameNode. Use the " +
2971          DFS_SUPPORT_APPEND_KEY + " configuration option to enable it.");
2972    }
2973
2974    LocatedBlock lb = null;
2975    HdfsFileStatus stat = null;
2976    FSPermissionChecker pc = getPermissionChecker();
2977    checkOperation(OperationCategory.WRITE);
2978    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2979    writeLock();
2980    try {
2981      checkOperation(OperationCategory.WRITE);
2982      checkNameNodeSafeMode("Cannot append to file" + src);
2983      src = dir.resolvePath(pc, src, pathComponents);
2984      final INodesInPath iip = dir.getINodesInPath4Write(src);
2985      lb = appendFileInternal(pc, iip, holder, clientMachine, newBlock,
2986          logRetryCache);
2987      stat = FSDirStatAndListingOp.getFileInfo(dir, src, false,
2988          FSDirectory.isReservedRawName(srcArg), true);
2989    } catch (StandbyException se) {
2990      skipSync = true;
2991      throw se;
2992    } finally {
2993      writeUnlock();
2994      // There might be transactions logged while trying to recover the lease.
2995      // They need to be sync'ed even when an exception was thrown.
2996      if (!skipSync) {
2997        getEditLog().logSync();
2998      }
2999    }
3000    if (lb != null) {
3001      NameNode.stateChangeLog.debug(
3002          "DIR* NameSystem.appendFile: file {} for {} at {} block {} block" +
3003          " size {}", src, holder, clientMachine, lb.getBlock(),
3004          lb.getBlock().getNumBytes());
3005    }
3006    logAuditEvent(true, "append", srcArg);
3007    return new LastBlockWithStatus(lb, stat);
3008  }
3009
3010  ExtendedBlock getExtendedBlock(Block blk) {
3011    return new ExtendedBlock(blockPoolId, blk);
3012  }
3013  
3014  void setBlockPoolId(String bpid) {
3015    blockPoolId = bpid;
3016    blockManager.setBlockPoolId(blockPoolId);
3017  }
3018
3019  /**
3020   * The client would like to obtain an additional block for the indicated
3021   * filename (which is being written-to).  Return an array that consists
3022   * of the block, plus a set of machines.  The first on this list should
3023   * be where the client writes data.  Subsequent items in the list must
3024   * be provided in the connection to the first datanode.
3025   *
3026   * Make sure the previous blocks have been reported by datanodes and
3027   * are replicated.  Will return an empty 2-elt array if we want the
3028   * client to "try again later".
3029   */
3030  LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
3031      ExtendedBlock previous, Set<Node> excludedNodes, 
3032      List<String> favoredNodes) throws IOException {
3033    LocatedBlock[] onRetryBlock = new LocatedBlock[1];
3034    DatanodeStorageInfo targets[] = getNewBlockTargets(src, fileId,
3035        clientName, previous, excludedNodes, favoredNodes, onRetryBlock);
3036    if (targets == null) {
3037      assert onRetryBlock[0] != null : "Retry block is null";
3038      // This is a retry. Just return the last block.
3039      return onRetryBlock[0];
3040    }
3041    LocatedBlock newBlock = storeAllocatedBlock(
3042        src, fileId, clientName, previous, targets);
3043    return newBlock;
3044  }
3045
3046  /**
3047   * Part I of getAdditionalBlock().
3048   * Analyze the state of the file under read lock to determine if the client
3049   * can add a new block, detect potential retries, lease mismatches,
3050   * and minimal replication of the penultimate block.
3051   * 
3052   * Generate target DataNode locations for the new block,
3053   * but do not create the new block yet.
3054   */
3055  DatanodeStorageInfo[] getNewBlockTargets(String src, long fileId,
3056      String clientName, ExtendedBlock previous, Set<Node> excludedNodes,
3057      List<String> favoredNodes, LocatedBlock[] onRetryBlock) throws IOException {
3058    final long blockSize;
3059    final int replication;
3060    final byte storagePolicyID;
3061    Node clientNode = null;
3062    String clientMachine = null;
3063
3064    NameNode.stateChangeLog.debug("BLOCK* getAdditionalBlock: {}  inodeId {}" +
3065        " for {}", src, fileId, clientName);
3066
3067    checkOperation(OperationCategory.READ);
3068    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3069    FSPermissionChecker pc = getPermissionChecker();
3070    readLock();
3071    try {
3072      checkOperation(OperationCategory.READ);
3073      src = dir.resolvePath(pc, src, pathComponents);
3074      FileState fileState = analyzeFileState(
3075          src, fileId, clientName, previous, onRetryBlock);
3076      final INodeFile pendingFile = fileState.inode;
3077      // Check if the penultimate block is minimally replicated
3078      if (!checkFileProgress(src, pendingFile, false)) {
3079        throw new NotReplicatedYetException("Not replicated yet: " + src);
3080      }
3081      src = fileState.path;
3082
3083      if (onRetryBlock[0] != null && onRetryBlock[0].getLocations().length > 0) {
3084        // This is a retry. No need to generate new locations.
3085        // Use the last block if it has locations.
3086        return null;
3087      }
3088      if (pendingFile.getBlocks().length >= maxBlocksPerFile) {
3089        throw new IOException("File has reached the limit on maximum number of"
3090            + " blocks (" + DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY
3091            + "): " + pendingFile.getBlocks().length + " >= "
3092            + maxBlocksPerFile);
3093      }
3094      blockSize = pendingFile.getPreferredBlockSize();
3095      clientMachine = pendingFile.getFileUnderConstructionFeature()
3096          .getClientMachine();
3097      clientNode = blockManager.getDatanodeManager().getDatanodeByHost(
3098          clientMachine);
3099      replication = pendingFile.getFileReplication();
3100      storagePolicyID = pendingFile.getStoragePolicyID();
3101    } finally {
3102      readUnlock();
3103    }
3104
3105    if (clientNode == null) {
3106      clientNode = getClientNode(clientMachine);
3107    }
3108
3109    // choose targets for the new block to be allocated.
3110    return getBlockManager().chooseTarget4NewBlock( 
3111        src, replication, clientNode, excludedNodes, blockSize, favoredNodes,
3112        storagePolicyID);
3113  }
3114
3115  /**
3116   * Part II of getAdditionalBlock().
3117   * Should repeat the same analysis of the file state as in Part 1,
3118   * but under the write lock.
3119   * If the conditions still hold, then allocate a new block with
3120   * the new targets, add it to the INode and to the BlocksMap.
3121   */
3122  LocatedBlock storeAllocatedBlock(String src, long fileId, String clientName,
3123      ExtendedBlock previous, DatanodeStorageInfo[] targets) throws IOException {
3124    Block newBlock = null;
3125    long offset;
3126    checkOperation(OperationCategory.WRITE);
3127    waitForLoadingFSImage();
3128    writeLock();
3129    try {
3130      checkOperation(OperationCategory.WRITE);
3131      // Run the full analysis again, since things could have changed
3132      // while chooseTarget() was executing.
3133      LocatedBlock[] onRetryBlock = new LocatedBlock[1];
3134      FileState fileState = 
3135          analyzeFileState(src, fileId, clientName, previous, onRetryBlock);
3136      final INodeFile pendingFile = fileState.inode;
3137      src = fileState.path;
3138
3139      if (onRetryBlock[0] != null) {
3140        if (onRetryBlock[0].getLocations().length > 0) {
3141          // This is a retry. Just return the last block if having locations.
3142          return onRetryBlock[0];
3143        } else {
3144          // add new chosen targets to already allocated block and return
3145          BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock();
3146          ((BlockInfoContiguousUnderConstruction) lastBlockInFile)
3147              .setExpectedLocations(targets);
3148          offset = pendingFile.computeFileSize();
3149          return makeLocatedBlock(lastBlockInFile, targets, offset);
3150        }
3151      }
3152
3153      // commit the last block and complete it if it has minimum replicas
3154      commitOrCompleteLastBlock(pendingFile, fileState.iip,
3155                                ExtendedBlock.getLocalBlock(previous));
3156
3157      // allocate new block, record block locations in INode.
3158      newBlock = createNewBlock();
3159      INodesInPath inodesInPath = INodesInPath.fromINode(pendingFile);
3160      saveAllocatedBlock(src, inodesInPath, newBlock, targets);
3161
3162      persistNewBlock(src, pendingFile);
3163      offset = pendingFile.computeFileSize();
3164    } finally {
3165      writeUnlock();
3166    }
3167    getEditLog().logSync();
3168
3169    // Return located block
3170    return makeLocatedBlock(newBlock, targets, offset);
3171  }
3172
3173  /*
3174   * Resolve clientmachine address to get a network location path
3175   */
3176  private Node getClientNode(String clientMachine) {
3177    List<String> hosts = new ArrayList<String>(1);
3178    hosts.add(clientMachine);
3179    List<String> rName = getBlockManager().getDatanodeManager()
3180        .resolveNetworkLocation(hosts);
3181    Node clientNode = null;
3182    if (rName != null) {
3183      // Able to resolve clientMachine mapping.
3184      // Create a temp node to findout the rack local nodes
3185      clientNode = new NodeBase(rName.get(0) + NodeBase.PATH_SEPARATOR_STR
3186          + clientMachine);
3187    }
3188    return clientNode;
3189  }
3190
  /**
   * Immutable snapshot of a file's resolved state as computed by
   * {@link #analyzeFileState}: the file inode, its resolved path, and the
   * INodesInPath used to reach it.
   */
  static class FileState {
    public final INodeFile inode;
    public final String path;
    public final INodesInPath iip;

    public FileState(INodeFile inode, String fullPath, INodesInPath iip) {
      this.inode = inode;
      this.path = fullPath;
      this.iip = iip;
    }
  }
3202
  /**
   * Verify that {@code clientName} may add a block to {@code src}: checks
   * safe mode, the fs-object limit, the lease, and that {@code previous}
   * matches the file's actual last block. When the call is recognized as an
   * RPC retry, {@code onRetryBlock[0]} is set to the block the client should
   * reuse. Caller must hold at least the read lock.
   *
   * @return the resolved file state (inode, path, INodesInPath)
   */
  FileState analyzeFileState(String src,
                                long fileId,
                                String clientName,
                                ExtendedBlock previous,
                                LocatedBlock[] onRetryBlock)
          throws IOException  {
    assert hasReadLock();

    checkBlock(previous);
    onRetryBlock[0] = null;
    checkNameNodeSafeMode("Cannot add block to " + src);

    // have we exceeded the configured limit of fs objects.
    checkFsObjectLimit();

    Block previousBlock = ExtendedBlock.getLocalBlock(previous);
    final INode inode;
    final INodesInPath iip;
    if (fileId == INodeId.GRANDFATHER_INODE_ID) {
      // Older clients may not have given us an inode ID to work with.
      // In this case, we have to try to resolve the path and hope it
      // hasn't changed or been deleted since the file was opened for write.
      iip = dir.getINodesInPath4Write(src);
      inode = iip.getLastINode();
    } else {
      // Newer clients pass the inode ID, so we can just get the inode
      // directly.
      inode = dir.getInode(fileId);
      iip = INodesInPath.fromINode(inode);
      if (inode != null) {
        src = iip.getPath();
      }
    }
    final INodeFile pendingFile = checkLease(src, clientName, inode, fileId);
    BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock();
    if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) {
      // The block that the client claims is the current last block
      // doesn't match up with what we think is the last block. There are
      // four possibilities:
      // 1) This is the first block allocation of an append() pipeline
      //    which started appending exactly at or exceeding the block boundary.
      //    In this case, the client isn't passed the previous block,
      //    so it makes the allocateBlock() call with previous=null.
      //    We can distinguish this since the last block of the file
      //    will be exactly a full block.
      // 2) This is a retry from a client that missed the response of a
      //    prior getAdditionalBlock() call, perhaps because of a network
      //    timeout, or because of an HA failover. In that case, we know
      //    by the fact that the client is re-issuing the RPC that it
      //    never began to write to the old block. Hence it is safe to
      //    to return the existing block.
      // 3) This is an entirely bogus request/bug -- we should error out
      //    rather than potentially appending a new block with an empty
      //    one in the middle, etc
      // 4) This is a retry from a client that timed out while
      //    the prior getAdditionalBlock() is still being processed,
      //    currently working on chooseTarget(). 
      //    There are no means to distinguish between the first and 
      //    the second attempts in Part I, because the first one hasn't
      //    changed the namesystem state yet.
      //    We run this analysis again in Part II where case 4 is impossible.

      BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock();
      if (previous == null &&
          lastBlockInFile != null &&
          lastBlockInFile.getNumBytes() >= pendingFile.getPreferredBlockSize() &&
          lastBlockInFile.isComplete()) {
        // Case 1
        NameNode.stateChangeLog.debug(
            "BLOCK* NameSystem.allocateBlock: handling block allocation" +
            " writing to a file with a complete previous block: src={}" +
            " lastBlock={}", src, lastBlockInFile);
      } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) {
        if (lastBlockInFile.getNumBytes() != 0) {
          throw new IOException(
              "Request looked like a retry to allocate block " +
              lastBlockInFile + " but it already contains " +
              lastBlockInFile.getNumBytes() + " bytes");
        }

        // Case 2
        // Return the last block.
        NameNode.stateChangeLog.info("BLOCK* allocateBlock: " +
            "caught retry for allocation of a new block in " +
            src + ". Returning previously allocated block " + lastBlockInFile);
        long offset = pendingFile.computeFileSize();
        onRetryBlock[0] = makeLocatedBlock(lastBlockInFile,
            ((BlockInfoContiguousUnderConstruction)lastBlockInFile).getExpectedStorageLocations(),
            offset);
        return new FileState(pendingFile, src, iip);
      } else {
        // Case 3
        throw new IOException("Cannot allocate block in " + src + ": " +
            "passed 'previous' block " + previous + " does not match actual " +
            "last block in file " + lastBlockInFile);
      }
    }
    return new FileState(pendingFile, src, iip);
  }
3302
3303  LocatedBlock makeLocatedBlock(Block blk, DatanodeStorageInfo[] locs,
3304                                        long offset) throws IOException {
3305    LocatedBlock lBlk = new LocatedBlock(
3306        getExtendedBlock(blk), locs, offset, false);
3307    getBlockManager().setBlockToken(
3308        lBlk, BlockTokenSecretManager.AccessMode.WRITE);
3309    return lBlk;
3310  }
3311
  /**
   * Choose additional datanodes for an existing block, e.g. to replace a
   * failed node in a write pipeline.
   * @see ClientProtocol#getAdditionalDatanode
   */
  LocatedBlock getAdditionalDatanode(String src, long fileId,
      final ExtendedBlock blk, final DatanodeInfo[] existings,
      final String[] storageIDs,
      final Set<Node> excludes,
      final int numAdditionalNodes, final String clientName
      ) throws IOException {
    //check if the feature is enabled
    dtpReplaceDatanodeOnFailure.checkEnabled();

    Node clientnode = null;
    String clientMachine;
    final long preferredblocksize;
    final byte storagePolicyID;
    final List<DatanodeStorageInfo> chosen;
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    FSPermissionChecker pc = getPermissionChecker();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      //check safe mode
      checkNameNodeSafeMode("Cannot add datanode; src=" + src + ", blk=" + blk);
      src = dir.resolvePath(pc, src, pathComponents);

      //check lease
      final INode inode;
      if (fileId == INodeId.GRANDFATHER_INODE_ID) {
        // Older clients may not have given us an inode ID to work with.
        // In this case, we have to try to resolve the path and hope it
        // hasn't changed or been deleted since the file was opened for write.
        inode = dir.getINode(src);
      } else {
        inode = dir.getInode(fileId);
        if (inode != null) src = inode.getFullPathName();
      }
      final INodeFile file = checkLease(src, clientName, inode, fileId);
      clientMachine = file.getFileUnderConstructionFeature().getClientMachine();
      clientnode = blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
      preferredblocksize = file.getPreferredBlockSize();
      storagePolicyID = file.getStoragePolicyID();

      //find datanode storages
      final DatanodeManager dm = blockManager.getDatanodeManager();
      chosen = Arrays.asList(dm.getDatanodeStorageInfos(existings, storageIDs));
    } finally {
      readUnlock();
    }

    // Client is not a datanode; resolve its topology location instead.
    if (clientnode == null) {
      clientnode = getClientNode(clientMachine);
    }

    // choose new datanodes.
    final DatanodeStorageInfo[] targets = blockManager.chooseTarget4AdditionalDatanode(
        src, numAdditionalNodes, clientnode, chosen, 
        excludes, preferredblocksize, storagePolicyID);
    // COPY token lets the client replicate the existing block to the new node.
    final LocatedBlock lb = new LocatedBlock(blk, targets);
    blockManager.setBlockToken(lb, AccessMode.COPY);
    return lb;
  }
3373
3374  /**
3375   * The client would like to let go of the given block
3376   */
3377  boolean abandonBlock(ExtendedBlock b, long fileId, String src, String holder)
3378      throws IOException {
3379    NameNode.stateChangeLog.debug(
3380        "BLOCK* NameSystem.abandonBlock: {} of file {}", b, src);
3381    checkOperation(OperationCategory.WRITE);
3382    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3383    FSPermissionChecker pc = getPermissionChecker();
3384    waitForLoadingFSImage();
3385    writeLock();
3386    try {
3387      checkOperation(OperationCategory.WRITE);
3388      checkNameNodeSafeMode("Cannot abandon block " + b + " for file" + src);
3389      src = dir.resolvePath(pc, src, pathComponents);
3390
3391      final INode inode;
3392      final INodesInPath iip;
3393      if (fileId == INodeId.GRANDFATHER_INODE_ID) {
3394        // Older clients may not have given us an inode ID to work with.
3395        // In this case, we have to try to resolve the path and hope it
3396        // hasn't changed or been deleted since the file was opened for write.
3397        iip = dir.getINodesInPath(src, true);
3398        inode = iip.getLastINode();
3399      } else {
3400        inode = dir.getInode(fileId);
3401        iip = INodesInPath.fromINode(inode);
3402        if (inode != null) {
3403          src = iip.getPath();
3404        }
3405      }
3406      final INodeFile file = checkLease(src, holder, inode, fileId);
3407
3408      // Remove the block from the pending creates list
3409      boolean removed = dir.removeBlock(src, iip, file,
3410          ExtendedBlock.getLocalBlock(b));
3411      if (!removed) {
3412        return true;
3413      }
3414      NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: {} is " +
3415          "removed from pendingCreates", b);
3416      persistBlocks(src, file, false);
3417    } finally {
3418      writeUnlock();
3419    }
3420    getEditLog().logSync();
3421
3422    return true;
3423  }
3424
3425  private INodeFile checkLease(String src, String holder, INode inode,
3426      long fileId) throws LeaseExpiredException, FileNotFoundException {
3427    assert hasReadLock();
3428    final String ident = src + " (inode " + fileId + ")";
3429    if (inode == null) {
3430      Lease lease = leaseManager.getLease(holder);
3431      throw new LeaseExpiredException(
3432          "No lease on " + ident + ": File does not exist. "
3433          + (lease != null ? lease.toString()
3434              : "Holder " + holder + " does not have any open files."));
3435    }
3436    if (!inode.isFile()) {
3437      Lease lease = leaseManager.getLease(holder);
3438      throw new LeaseExpiredException(
3439          "No lease on " + ident + ": INode is not a regular file. "
3440              + (lease != null ? lease.toString()
3441              : "Holder " + holder + " does not have any open files."));
3442    }
3443    final INodeFile file = inode.asFile();
3444    if (!file.isUnderConstruction()) {
3445      Lease lease = leaseManager.getLease(holder);
3446      throw new LeaseExpiredException(
3447          "No lease on " + ident + ": File is not open for writing. "
3448          + (lease != null ? lease.toString()
3449              : "Holder " + holder + " does not have any open files."));
3450    }
3451    // No further modification is allowed on a deleted file.
3452    // A file is considered deleted, if it is not in the inodeMap or is marked
3453    // as deleted in the snapshot feature.
3454    if (isFileDeleted(file)) {
3455      throw new FileNotFoundException(src);
3456    }
3457    String clientName = file.getFileUnderConstructionFeature().getClientName();
3458    if (holder != null && !clientName.equals(holder)) {
3459      throw new LeaseExpiredException("Lease mismatch on " + ident +
3460          " owned by " + clientName + " but is accessed by " + holder);
3461    }
3462    return file;
3463  }
3464 
3465  /**
3466   * Complete in-progress write to the given file.
3467   * @return true if successful, false if the client should continue to retry
3468   *         (e.g if not all blocks have reached minimum replication yet)
3469   * @throws IOException on error (eg lease mismatch, file not open, file deleted)
3470   */
3471  boolean completeFile(final String srcArg, String holder,
3472                       ExtendedBlock last, long fileId)
3473    throws SafeModeException, UnresolvedLinkException, IOException {
3474    String src = srcArg;
3475    NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: {} for {}",
3476        src, holder);
3477    checkBlock(last);
3478    boolean success = false;
3479    checkOperation(OperationCategory.WRITE);
3480    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3481    FSPermissionChecker pc = getPermissionChecker();
3482    waitForLoadingFSImage();
3483    writeLock();
3484    try {
3485      checkOperation(OperationCategory.WRITE);
3486      checkNameNodeSafeMode("Cannot complete file " + src);
3487      src = dir.resolvePath(pc, src, pathComponents);
3488      success = completeFileInternal(src, holder,
3489        ExtendedBlock.getLocalBlock(last), fileId);
3490    } finally {
3491      writeUnlock();
3492    }
3493    getEditLog().logSync();
3494    if (success) {
3495      NameNode.stateChangeLog.info("DIR* completeFile: " + srcArg
3496          + " is closed by " + holder);
3497    }
3498    return success;
3499  }
3500
  /**
   * Worker for {@link #completeFile}: resolves the inode, checks the lease
   * (tolerating a retry of an already-closed file, see HDFS-3031), verifies
   * block replication progress, commits the last block and finalizes the
   * file. Caller must hold the write lock.
   *
   * @return true if the file was (or already had been) closed; false if the
   *         client should retry because blocks lack minimum replication
   */
  private boolean completeFileInternal(String src, String holder, Block last,
      long fileId) throws IOException {
    assert hasWriteLock();
    final INodeFile pendingFile;
    final INodesInPath iip;
    INode inode = null;
    try {
      if (fileId == INodeId.GRANDFATHER_INODE_ID) {
        // Older clients may not have given us an inode ID to work with.
        // In this case, we have to try to resolve the path and hope it
        // hasn't changed or been deleted since the file was opened for write.
        iip = dir.getINodesInPath(src, true);
        inode = iip.getLastINode();
      } else {
        inode = dir.getInode(fileId);
        iip = INodesInPath.fromINode(inode);
        if (inode != null) {
          src = iip.getPath();
        }
      }
      pendingFile = checkLease(src, holder, inode, fileId);
    } catch (LeaseExpiredException lee) {
      if (inode != null && inode.isFile() &&
          !inode.asFile().isUnderConstruction()) {
        // This could be a retry RPC - i.e the client tried to close
        // the file, but missed the RPC response. Thus, it is trying
        // again to close the file. If the file still exists and
        // the client's view of the last block matches the actual
        // last block, then we'll treat it as a successful close.
        // See HDFS-3031.
        final Block realLastBlock = inode.asFile().getLastBlock();
        if (Block.matchingIdAndGenStamp(last, realLastBlock)) {
          NameNode.stateChangeLog.info("DIR* completeFile: " +
              "request from " + holder + " to complete inode " + fileId +
              "(" + src + ") which is already closed. But, it appears to be " +
              "an RPC retry. Returning success");
          return true;
        }
      }
      throw lee;
    }
    // Check the state of the penultimate block. It should be completed
    // before attempting to complete the last one.
    if (!checkFileProgress(src, pendingFile, false)) {
      return false;
    }

    // commit the last block and complete it if it has minimum replicas
    commitOrCompleteLastBlock(pendingFile, iip, last);

    // Now every block (including the last) must be complete.
    if (!checkFileProgress(src, pendingFile, true)) {
      return false;
    }

    finalizeINodeFileUnderConstruction(src, pendingFile,
        Snapshot.CURRENT_STATE_ID);
    return true;
  }
3559
3560  /**
3561   * Save allocated block at the given pending filename
3562   * 
3563   * @param src path to the file
3564   * @param inodesInPath representing each of the components of src.
3565   *                     The last INode is the INode for {@code src} file.
3566   * @param newBlock newly allocated block to be save
3567   * @param targets target datanodes where replicas of the new block is placed
3568   * @throws QuotaExceededException If addition of block exceeds space quota
3569   */
3570  BlockInfoContiguous saveAllocatedBlock(String src, INodesInPath inodesInPath,
3571      Block newBlock, DatanodeStorageInfo[] targets)
3572          throws IOException {
3573    assert hasWriteLock();
3574    BlockInfoContiguous b = dir.addBlock(src, inodesInPath, newBlock, targets);
3575    NameNode.stateChangeLog.info("BLOCK* allocate " + b + " for " + src);
3576    DatanodeStorageInfo.incrementBlocksScheduled(targets);
3577    return b;
3578  }
3579
3580  /**
3581   * Create new block with a unique block id and a new generation stamp.
3582   */
3583  Block createNewBlock() throws IOException {
3584    assert hasWriteLock();
3585    Block b = new Block(nextBlockId(), 0, 0);
3586    // Increment the generation stamp for every new block.
3587    b.setGenerationStamp(nextGenerationStamp(false));
3588    return b;
3589  }
3590
3591  /**
3592   * Check that the indicated file's blocks are present and
3593   * replicated.  If not, return false. If checkall is true, then check
3594   * all blocks, otherwise check only penultimate block.
3595   */
3596  boolean checkFileProgress(String src, INodeFile v, boolean checkall) {
3597    if (checkall) {
3598      // check all blocks of the file.
3599      for (BlockInfoContiguous block: v.getBlocks()) {
3600        if (!isCompleteBlock(src, block, blockManager.minReplication)) {
3601          return false;
3602        }
3603      }
3604    } else {
3605      // check the penultimate block of this file
3606      BlockInfoContiguous b = v.getPenultimateBlock();
3607      if (b != null
3608          && !isCompleteBlock(src, b, blockManager.minReplication)) {
3609        return false;
3610      }
3611    }
3612    return true;
3613  }
3614
3615  private static boolean isCompleteBlock(String src, BlockInfoContiguous b, int minRepl) {
3616    if (!b.isComplete()) {
3617      final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)b;
3618      final int numNodes = b.numNodes();
3619      LOG.info("BLOCK* " + b + " is not COMPLETE (ucState = "
3620          + uc.getBlockUCState() + ", replication# = " + numNodes
3621          + (numNodes < minRepl? " < ": " >= ")
3622          + " minimum = " + minRepl + ") in file " + src);
3623      return false;
3624    }
3625    return true;
3626  }
3627
3628  ////////////////////////////////////////////////////////////////
3629  // Here's how to handle block-copy failure during client write:
3630  // -- As usual, the client's write should result in a streaming
3631  // backup write to a k-machine sequence.
3632  // -- If one of the backup machines fails, no worries.  Fail silently.
3633  // -- Before client is allowed to close and finalize file, make sure
3634  // that the blocks are backed up.  Namenode may have to issue specific backup
3635  // commands to make up for earlier datanode failures.  Once all copies
3636  // are made, edit namespace and return to client.
3637  ////////////////////////////////////////////////////////////////
3638
3639  /** 
3640   * Change the indicated filename. 
3641   * @deprecated Use {@link #renameTo(String, String, boolean,
3642   * Options.Rename...)} instead.
3643   */
3644  @Deprecated
3645  boolean renameTo(String src, String dst, boolean logRetryCache)
3646      throws IOException {
3647    waitForLoadingFSImage();
3648    checkOperation(OperationCategory.WRITE);
3649    FSDirRenameOp.RenameOldResult ret = null;
3650    writeLock();
3651    try {
3652      checkOperation(OperationCategory.WRITE);
3653      checkNameNodeSafeMode("Cannot rename " + src);
3654      ret = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache);
3655    } catch (AccessControlException e)  {
3656      logAuditEvent(false, "rename", src, dst, null);
3657      throw e;
3658    } finally {
3659      writeUnlock();
3660    }
3661    boolean success = ret != null && ret.success;
3662    if (success) {
3663      getEditLog().logSync();
3664    }
3665    logAuditEvent(success, "rename", src, dst,
3666        ret == null ? null : ret.auditStat);
3667    return success;
3668  }
3669
  /**
   * Change the indicated filename, honoring the given rename options
   * (e.g. OVERWRITE). Blocks of an overwritten destination are collected
   * and incrementally removed after the lock is released.
   */
  void renameTo(final String src, final String dst,
                boolean logRetryCache, Options.Rename... options)
      throws IOException {
    waitForLoadingFSImage();
    checkOperation(OperationCategory.WRITE);
    Map.Entry<BlocksMapUpdateInfo, HdfsFileStatus> res = null;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot rename " + src);
      res = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache, options);
    } catch (AccessControlException e) {
      // Audit the denial before propagating it to the caller.
      logAuditEvent(false, "rename (options=" + Arrays.toString(options) +
          ")", src, dst, null);
      throw e;
    } finally {
      writeUnlock();
    }

    getEditLog().logSync();

    BlocksMapUpdateInfo collectedBlocks = res.getKey();
    HdfsFileStatus auditStat = res.getValue();
    // Delete any blocks collected from an overwritten destination, outside
    // the namesystem lock.
    if (!collectedBlocks.getToDeleteList().isEmpty()) {
      removeBlocks(collectedBlocks);
      collectedBlocks.clear();
    }

    logAuditEvent(true, "rename (options=" + Arrays.toString(options) +
        ")", src, dst, auditStat);
  }
3701
3702  /**
3703   * Remove the indicated file from namespace.
3704   * 
3705   * @see ClientProtocol#delete(String, boolean) for detailed description and 
3706   * description of exceptions
3707   */
3708  boolean delete(String src, boolean recursive, boolean logRetryCache)
3709      throws IOException {
3710    waitForLoadingFSImage();
3711    checkOperation(OperationCategory.WRITE);
3712    BlocksMapUpdateInfo toRemovedBlocks = null;
3713    writeLock();
3714    boolean ret = false;
3715    try {
3716      checkOperation(OperationCategory.WRITE);
3717      checkNameNodeSafeMode("Cannot delete " + src);
3718      toRemovedBlocks = FSDirDeleteOp.delete(
3719          this, src, recursive, logRetryCache);
3720      ret = toRemovedBlocks != null;
3721    } catch (AccessControlException e) {
3722      logAuditEvent(false, "delete", src);
3723      throw e;
3724    } finally {
3725      writeUnlock();
3726    }
3727    getEditLog().logSync();
3728    if (toRemovedBlocks != null) {
3729      removeBlocks(toRemovedBlocks); // Incremental deletion of blocks
3730    }
3731    logAuditEvent(true, "delete", src);
3732    return ret;
3733  }
3734
  /** Obtain a permission checker from the directory for the current caller. */
  FSPermissionChecker getPermissionChecker()
      throws AccessControlException {
    return dir.getPermissionChecker();
  }
3739
3740  /**
3741   * From the given list, incrementally remove the blocks from blockManager
3742   * Writelock is dropped and reacquired every BLOCK_DELETION_INCREMENT to
3743   * ensure that other waiters on the lock can get in. See HDFS-2938
3744   * 
3745   * @param blocks
3746   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3747   *          of blocks that need to be removed from blocksMap
3748   */
3749  void removeBlocks(BlocksMapUpdateInfo blocks) {
3750    List<Block> toDeleteList = blocks.getToDeleteList();
3751    Iterator<Block> iter = toDeleteList.iterator();
3752    while (iter.hasNext()) {
3753      writeLock();
3754      try {
3755        for (int i = 0; i < BLOCK_DELETION_INCREMENT && iter.hasNext(); i++) {
3756          blockManager.removeBlock(iter.next());
3757        }
3758      } finally {
3759        writeUnlock();
3760      }
3761    }
3762  }
3763  
3764  /**
3765   * Remove leases and inodes related to a given path
3766   * @param src The given path
3767   * @param removedINodes Containing the list of inodes to be removed from
3768   *                      inodesMap
3769   * @param acquireINodeMapLock Whether to acquire the lock for inode removal
3770   */
3771  void removeLeasesAndINodes(String src, List<INode> removedINodes,
3772      final boolean acquireINodeMapLock) {
3773    assert hasWriteLock();
3774    leaseManager.removeLeaseWithPrefixPath(src);
3775    // remove inodes from inodesMap
3776    if (removedINodes != null) {
3777      if (acquireINodeMapLock) {
3778        dir.writeLock();
3779      }
3780      try {
3781        dir.removeFromInodeMap(removedINodes);
3782      } finally {
3783        if (acquireINodeMapLock) {
3784          dir.writeUnlock();
3785        }
3786      }
3787      removedINodes.clear();
3788    }
3789  }
3790
3791  /**
3792   * Removes the blocks from blocksmap and updates the safemode blocks total
3793   * 
3794   * @param blocks
3795   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3796   *          of blocks that need to be removed from blocksMap
3797   */
3798  void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) {
3799    assert hasWriteLock();
3800    // In the case that we are a Standby tailing edits from the
3801    // active while in safe-mode, we need to track the total number
3802    // of blocks and safe blocks in the system.
3803    boolean trackBlockCounts = isSafeModeTrackingBlocks();
3804    int numRemovedComplete = 0, numRemovedSafe = 0;
3805
3806    for (Block b : blocks.getToDeleteList()) {
3807      if (trackBlockCounts) {
3808        BlockInfoContiguous bi = getStoredBlock(b);
3809        if (bi.isComplete()) {
3810          numRemovedComplete++;
3811          if (bi.numNodes() >= blockManager.minReplication) {
3812            numRemovedSafe++;
3813          }
3814        }
3815      }
3816      blockManager.removeBlock(b);
3817    }
3818    if (trackBlockCounts) {
3819      if (LOG.isDebugEnabled()) {
3820        LOG.debug("Adjusting safe-mode totals for deletion."
3821            + "decreasing safeBlocks by " + numRemovedSafe
3822            + ", totalBlocks by " + numRemovedComplete);
3823      }
3824      adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete);
3825    }
3826  }
3827
3828  /**
3829   * @see SafeModeInfo#shouldIncrementallyTrackBlocks
3830   */
3831  private boolean isSafeModeTrackingBlocks() {
3832    if (!haEnabled) {
3833      // Never track blocks incrementally in non-HA code.
3834      return false;
3835    }
3836    SafeModeInfo sm = this.safeMode;
3837    return sm != null && sm.shouldIncrementallyTrackBlocks();
3838  }
3839
3840  /**
3841   * Get the file info for a specific file.
3842   *
3843   * @param src The string representation of the path to the file
3844   * @param resolveLink whether to throw UnresolvedLinkException
3845   *        if src refers to a symlink
3846   *
3847   * @throws AccessControlException if access is denied
3848   * @throws UnresolvedLinkException if a symlink is encountered.
3849   *
3850   * @return object containing information regarding the file
3851   *         or null if file not found
3852   * @throws StandbyException
3853   */
3854  HdfsFileStatus getFileInfo(final String src, boolean resolveLink)
3855    throws IOException {
3856    checkOperation(OperationCategory.READ);
3857    HdfsFileStatus stat = null;
3858    readLock();
3859    try {
3860      checkOperation(OperationCategory.READ);
3861      stat = FSDirStatAndListingOp.getFileInfo(dir, src, resolveLink);
3862    } catch (AccessControlException e) {
3863      logAuditEvent(false, "getfileinfo", src);
3864      throw e;
3865    } finally {
3866      readUnlock();
3867    }
3868    logAuditEvent(true, "getfileinfo", src);
3869    return stat;
3870  }
3871
3872  /**
3873   * Returns true if the file is closed
3874   */
3875  boolean isFileClosed(final String src) throws IOException {
3876    checkOperation(OperationCategory.READ);
3877    readLock();
3878    try {
3879      checkOperation(OperationCategory.READ);
3880      return FSDirStatAndListingOp.isFileClosed(dir, src);
3881    } catch (AccessControlException e) {
3882      logAuditEvent(false, "isFileClosed", src);
3883      throw e;
3884    } finally {
3885      readUnlock();
3886    }
3887  }
3888
3889  /**
3890   * Create all the necessary directories
3891   */
3892  boolean mkdirs(String src, PermissionStatus permissions,
3893      boolean createParent) throws IOException {
3894    HdfsFileStatus auditStat = null;
3895    checkOperation(OperationCategory.WRITE);
3896    writeLock();
3897    try {
3898      checkOperation(OperationCategory.WRITE);
3899      checkNameNodeSafeMode("Cannot create directory " + src);
3900      auditStat = FSDirMkdirOp.mkdirs(this, src, permissions, createParent);
3901    } catch (AccessControlException e) {
3902      logAuditEvent(false, "mkdirs", src);
3903      throw e;
3904    } finally {
3905      writeUnlock();
3906    }
3907    getEditLog().logSync();
3908    logAuditEvent(true, "mkdirs", src, null, auditStat);
3909    return true;
3910  }
3911
3912  /**
3913   * Get the content summary for a specific file/dir.
3914   *
3915   * @param src The string representation of the path to the file
3916   *
3917   * @throws AccessControlException if access is denied
3918   * @throws UnresolvedLinkException if a symlink is encountered.
3919   * @throws FileNotFoundException if no file exists
3920   * @throws StandbyException
3921   * @throws IOException for issues with writing to the audit log
3922   *
3923   * @return object containing information regarding the file
3924   *         or null if file not found
3925   */
3926  ContentSummary getContentSummary(final String src) throws IOException {
3927    readLock();
3928    boolean success = true;
3929    try {
3930      return FSDirStatAndListingOp.getContentSummary(dir, src);
3931    } catch (AccessControlException ace) {
3932      success = false;
3933      throw ace;
3934    } finally {
3935      readUnlock();
3936      logAuditEvent(success, "contentSummary", src);
3937    }
3938  }
3939
3940  /**
3941   * Set the namespace quota and storage space quota for a directory.
3942   * See {@link ClientProtocol#setQuota(String, long, long, StorageType)} for the
3943   * contract.
3944   * 
3945   * Note: This does not support ".inodes" relative path.
3946   */
3947  void setQuota(String src, long nsQuota, long ssQuota, StorageType type)
3948      throws IOException {
3949    checkOperation(OperationCategory.WRITE);
3950    writeLock();
3951    boolean success = false;
3952    try {
3953      checkOperation(OperationCategory.WRITE);
3954      checkNameNodeSafeMode("Cannot set quota on " + src);
3955      FSDirAttrOp.setQuota(dir, src, nsQuota, ssQuota, type);
3956      success = true;
3957    } finally {
3958      writeUnlock();
3959      if (success) {
3960        getEditLog().logSync();
3961      }
3962      logAuditEvent(success, "setQuota", src);
3963    }
3964  }
3965
  /** Persist all metadata about this file.
   * @param src The string representation of the path
   * @param fileId The inode ID that we're fsyncing.  Older clients will pass
   *               INodeId.GRANDFATHER_INODE_ID here.
   * @param clientName The string representation of the client
   * @param lastBlockLength The length of the last block 
   *                        under construction reported from client.
   * @throws IOException if path does not exist
   */
  void fsync(String src, long fileId, String clientName, long lastBlockLength)
      throws IOException {
    NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
    checkOperation(OperationCategory.WRITE);
    // Resolve reserved-path components before taking the lock.
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);

    FSPermissionChecker pc = getPermissionChecker();
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot fsync file " + src);
      src = dir.resolvePath(pc, src, pathComponents);
      final INode inode;
      if (fileId == INodeId.GRANDFATHER_INODE_ID) {
        // Older clients may not have given us an inode ID to work with.
        // In this case, we have to try to resolve the path and hope it
        // hasn't changed or been deleted since the file was opened for write.
        inode = dir.getINode(src);
      } else {
        // Prefer the inode ID: it stays valid across renames.
        inode = dir.getInode(fileId);
        if (inode != null) src = inode.getFullPathName();
      }
      // Verify the caller still holds the lease before persisting.
      final INodeFile pendingFile = checkLease(src, clientName, inode, fileId);
      if (lastBlockLength > 0) {
        // Record the client-reported length of the in-progress last block.
        pendingFile.getFileUnderConstructionFeature().updateLengthOfLastBlock(
            pendingFile, lastBlockLength);
      }
      persistBlocks(src, pendingFile, false);
    } finally {
      writeUnlock();
    }
    // Make the edit durable outside the lock.
    getEditLog().logSync();
  }
4009
4010  /**
4011   * Move a file that is being written to be immutable.
4012   * @param src The filename
4013   * @param lease The lease for the client creating the file
4014   * @param recoveryLeaseHolder reassign lease to this holder if the last block
4015   *        needs recovery; keep current holder if null.
4016   * @throws AlreadyBeingCreatedException if file is waiting to achieve minimal
4017   *         replication;<br>
4018   *         RecoveryInProgressException if lease recovery is in progress.<br>
4019   *         IOException in case of an error.
4020   * @return true  if file has been successfully finalized and closed or 
4021   *         false if block recovery has been initiated. Since the lease owner
4022   *         has been changed and logged, caller should call logSync().
4023   */
4024  boolean internalReleaseLease(Lease lease, String src, INodesInPath iip,
4025      String recoveryLeaseHolder) throws IOException {
4026    LOG.info("Recovering " + lease + ", src=" + src);
4027    assert !isInSafeMode();
4028    assert hasWriteLock();
4029
4030    final INodeFile pendingFile = iip.getLastINode().asFile();
4031    int nrBlocks = pendingFile.numBlocks();
4032    BlockInfoContiguous[] blocks = pendingFile.getBlocks();
4033
4034    int nrCompleteBlocks;
4035    BlockInfoContiguous curBlock = null;
4036    for(nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) {
4037      curBlock = blocks[nrCompleteBlocks];
4038      if(!curBlock.isComplete())
4039        break;
4040      assert blockManager.checkMinReplication(curBlock) :
4041              "A COMPLETE block is not minimally replicated in " + src;
4042    }
4043
4044    // If there are no incomplete blocks associated with this file,
4045    // then reap lease immediately and close the file.
4046    if(nrCompleteBlocks == nrBlocks) {
4047      finalizeINodeFileUnderConstruction(src, pendingFile,
4048          iip.getLatestSnapshotId());
4049      NameNode.stateChangeLog.warn("BLOCK*"
4050        + " internalReleaseLease: All existing blocks are COMPLETE,"
4051        + " lease removed, file closed.");
4052      return true;  // closed!
4053    }
4054
4055    // Only the last and the penultimate blocks may be in non COMPLETE state.
4056    // If the penultimate block is not COMPLETE, then it must be COMMITTED.
4057    if(nrCompleteBlocks < nrBlocks - 2 ||
4058       nrCompleteBlocks == nrBlocks - 2 &&
4059         curBlock != null &&
4060         curBlock.getBlockUCState() != BlockUCState.COMMITTED) {
4061      final String message = "DIR* NameSystem.internalReleaseLease: "
4062        + "attempt to release a create lock on "
4063        + src + " but file is already closed.";
4064      NameNode.stateChangeLog.warn(message);
4065      throw new IOException(message);
4066    }
4067
4068    // The last block is not COMPLETE, and
4069    // that the penultimate block if exists is either COMPLETE or COMMITTED
4070    final BlockInfoContiguous lastBlock = pendingFile.getLastBlock();
4071    BlockUCState lastBlockState = lastBlock.getBlockUCState();
4072    BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock();
4073
4074    // If penultimate block doesn't exist then its minReplication is met
4075    boolean penultimateBlockMinReplication = penultimateBlock == null ? true :
4076        blockManager.checkMinReplication(penultimateBlock);
4077
4078    switch(lastBlockState) {
4079    case COMPLETE:
4080      assert false : "Already checked that the last block is incomplete";
4081      break;
4082    case COMMITTED:
4083      // Close file if committed blocks are minimally replicated
4084      if(penultimateBlockMinReplication &&
4085          blockManager.checkMinReplication(lastBlock)) {
4086        finalizeINodeFileUnderConstruction(src, pendingFile,
4087            iip.getLatestSnapshotId());
4088        NameNode.stateChangeLog.warn("BLOCK*"
4089          + " internalReleaseLease: Committed blocks are minimally replicated,"
4090          + " lease removed, file closed.");
4091        return true;  // closed!
4092      }
4093      // Cannot close file right now, since some blocks 
4094      // are not yet minimally replicated.
4095      // This may potentially cause infinite loop in lease recovery
4096      // if there are no valid replicas on data-nodes.
4097      String message = "DIR* NameSystem.internalReleaseLease: " +
4098          "Failed to release lease for file " + src +
4099          ". Committed blocks are waiting to be minimally replicated." +
4100          " Try again later.";
4101      NameNode.stateChangeLog.warn(message);
4102      throw new AlreadyBeingCreatedException(message);
4103    case UNDER_CONSTRUCTION:
4104    case UNDER_RECOVERY:
4105      final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)lastBlock;
4106      // determine if last block was intended to be truncated
4107      Block recoveryBlock = uc.getTruncateBlock();
4108      boolean truncateRecovery = recoveryBlock != null;
4109      boolean copyOnTruncate = truncateRecovery &&
4110          recoveryBlock.getBlockId() != uc.getBlockId();
4111      assert !copyOnTruncate ||
4112          recoveryBlock.getBlockId() < uc.getBlockId() &&
4113          recoveryBlock.getGenerationStamp() < uc.getGenerationStamp() &&
4114          recoveryBlock.getNumBytes() > uc.getNumBytes() :
4115            "wrong recoveryBlock";
4116
4117      // setup the last block locations from the blockManager if not known
4118      if (uc.getNumExpectedLocations() == 0) {
4119        uc.setExpectedLocations(blockManager.getStorages(lastBlock));
4120      }
4121
4122      if (uc.getNumExpectedLocations() == 0 && uc.getNumBytes() == 0) {
4123        // There is no datanode reported to this block.
4124        // may be client have crashed before writing data to pipeline.
4125        // This blocks doesn't need any recovery.
4126        // We can remove this block and close the file.
4127        pendingFile.removeLastBlock(lastBlock);
4128        finalizeINodeFileUnderConstruction(src, pendingFile,
4129            iip.getLatestSnapshotId());
4130        NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: "
4131            + "Removed empty last block and closed file.");
4132        return true;
4133      }
4134      // start recovery of the last block for this file
4135      long blockRecoveryId = nextGenerationStamp(blockIdManager.isLegacyBlock(uc));
4136      lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile);
4137      if(copyOnTruncate) {
4138        uc.setGenerationStamp(blockRecoveryId);
4139      } else if(truncateRecovery) {
4140        recoveryBlock.setGenerationStamp(blockRecoveryId);
4141      }
4142      uc.initializeBlockRecovery(blockRecoveryId);
4143      leaseManager.renewLease(lease);
4144      // Cannot close file right now, since the last block requires recovery.
4145      // This may potentially cause infinite loop in lease recovery
4146      // if there are no valid replicas on data-nodes.
4147      NameNode.stateChangeLog.warn(
4148                "DIR* NameSystem.internalReleaseLease: " +
4149                "File " + src + " has not been closed." +
4150               " Lease recovery is in progress. " +
4151                "RecoveryId = " + blockRecoveryId + " for block " + lastBlock);
4152      break;
4153    }
4154    return false;
4155  }
4156
4157  private Lease reassignLease(Lease lease, String src, String newHolder,
4158      INodeFile pendingFile) {
4159    assert hasWriteLock();
4160    if(newHolder == null)
4161      return lease;
4162    // The following transaction is not synced. Make sure it's sync'ed later.
4163    logReassignLease(lease.getHolder(), src, newHolder);
4164    return reassignLeaseInternal(lease, src, newHolder, pendingFile);
4165  }
4166  
4167  Lease reassignLeaseInternal(Lease lease, String src, String newHolder,
4168      INodeFile pendingFile) {
4169    assert hasWriteLock();
4170    pendingFile.getFileUnderConstructionFeature().setClientName(newHolder);
4171    return leaseManager.reassignLease(lease, src, newHolder);
4172  }
4173
4174  private void commitOrCompleteLastBlock(final INodeFile fileINode,
4175      final INodesInPath iip, final Block commitBlock) throws IOException {
4176    assert hasWriteLock();
4177    Preconditions.checkArgument(fileINode.isUnderConstruction());
4178    if (!blockManager.commitOrCompleteLastBlock(fileINode, commitBlock)) {
4179      return;
4180    }
4181
4182    // Adjust disk space consumption if required
4183    final long diff = fileINode.getPreferredBlockSize() - commitBlock.getNumBytes();    
4184    if (diff > 0) {
4185      try {
4186        dir.updateSpaceConsumed(iip, 0, -diff, fileINode.getFileReplication());
4187      } catch (IOException e) {
4188        LOG.warn("Unexpected exception while updating disk space.", e);
4189      }
4190    }
4191  }
4192
  /**
   * Turn an under-construction file into a finalized file: drop its lease,
   * record the modification against the latest snapshot, strip the
   * under-construction feature, persist the close, and trigger a
   * replication check on its blocks.
   */
  private void finalizeINodeFileUnderConstruction(String src,
      INodeFile pendingFile, int latestSnapshot) throws IOException {
    assert hasWriteLock();

    FileUnderConstructionFeature uc = pendingFile.getFileUnderConstructionFeature();
    Preconditions.checkArgument(uc != null);
    // Release the writer's lease before finalizing.
    leaseManager.removeLease(uc.getClientName(), src);
    
    pendingFile.recordModification(latestSnapshot);

    // The file is no longer pending.
    // Create permanent INode, update blocks. No need to replace the inode here
    // since we just remove the uc feature from pendingFile
    pendingFile.toCompleteFile(now());

    waitForLoadingFSImage();
    // close file and persist block allocations for this file
    closeFile(src, pendingFile);

    blockManager.checkReplication(pendingFile);
  }
4214
4215  @VisibleForTesting
4216  BlockInfoContiguous getStoredBlock(Block block) {
4217    return blockManager.getStoredBlock(block);
4218  }
4219  
4220  @Override
4221  public boolean isInSnapshot(BlockInfoContiguousUnderConstruction blockUC) {
4222    assert hasReadLock();
4223    final BlockCollection bc = blockUC.getBlockCollection();
4224    if (bc == null || !(bc instanceof INodeFile)
4225        || !bc.isUnderConstruction()) {
4226      return false;
4227    }
4228
4229    String fullName = bc.getName();
4230    try {
4231      if (fullName != null && fullName.startsWith(Path.SEPARATOR)
4232          && dir.getINode(fullName) == bc) {
4233        // If file exists in normal path then no need to look in snapshot
4234        return false;
4235      }
4236    } catch (UnresolvedLinkException e) {
4237      LOG.error("Error while resolving the link : " + fullName, e);
4238      return false;
4239    }
4240    /*
4241     * 1. if bc is under construction and also with snapshot, and
4242     * bc is not in the current fsdirectory tree, bc must represent a snapshot
4243     * file. 
4244     * 2. if fullName is not an absolute path, bc cannot be existent in the 
4245     * current fsdirectory tree. 
4246     * 3. if bc is not the current node associated with fullName, bc must be a
4247     * snapshot inode.
4248     */
4249    return true;
4250  }
4251
  /**
   * Called by a datanode at the end of block recovery: update the recovered
   * last block's generation stamp and length, record the surviving replica
   * locations, and optionally close the file.
   *
   * @param oldBlock the block as it was before recovery
   * @param newgenerationstamp the recovery generation stamp; must match the
   *                           current recovery id of the last block
   * @param newlength the recovered length of the block
   * @param closeFile whether to finalize and close the file
   * @param deleteblock whether to remove the last block instead of updating it
   * @param newtargets datanodes holding the recovered replicas
   * @param newtargetstorages storage IDs parallel to {@code newtargets}
   * @throws IOException if the block is unknown, its file was deleted, or the
   *                     recovery id does not match
   */
  void commitBlockSynchronization(ExtendedBlock oldBlock,
      long newgenerationstamp, long newlength,
      boolean closeFile, boolean deleteblock, DatanodeID[] newtargets,
      String[] newtargetstorages) throws IOException {
    LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock
             + ", newgenerationstamp=" + newgenerationstamp
             + ", newlength=" + newlength
             + ", newtargets=" + Arrays.asList(newtargets)
             + ", closeFile=" + closeFile
             + ", deleteBlock=" + deleteblock
             + ")");
    checkOperation(OperationCategory.WRITE);
    String src = "";
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      // If a DN tries to commit to the standby, the recovery will
      // fail, and the next retry will succeed on the new NN.
  
      checkNameNodeSafeMode(
          "Cannot commitBlockSynchronization while in safe mode");
      final BlockInfoContiguous storedBlock = getStoredBlock(
          ExtendedBlock.getLocalBlock(oldBlock));
      if (storedBlock == null) {
        if (deleteblock) {
          // This may be a retry attempt so ignore the failure
          // to locate the block.
          if (LOG.isDebugEnabled()) {
            LOG.debug("Block (=" + oldBlock + ") not found");
          }
          return;
        } else {
          throw new IOException("Block (=" + oldBlock + ") not found");
        }
      }
      // Remember pre-recovery stamp/length so stale replicas can be marked
      // corrupt later on.
      final long oldGenerationStamp = storedBlock.getGenerationStamp();
      final long oldNumBytes = storedBlock.getNumBytes();
      //
      // The implementation of delete operation (see @deleteInternal method)
      // first removes the file paths from namespace, and delays the removal
      // of blocks to later time for better performance. When
      // commitBlockSynchronization (this method) is called in between, the
      // blockCollection of storedBlock could have been assigned to null by
      // the delete operation, throw IOException here instead of NPE; if the
      // file path is already removed from namespace by the delete operation,
      // throw FileNotFoundException here, so not to proceed to the end of
      // this method to add a CloseOp to the edit log for an already deleted
      // file (See HDFS-6825).
      //
      BlockCollection blockCollection = storedBlock.getBlockCollection();
      if (blockCollection == null) {
        throw new IOException("The blockCollection of " + storedBlock
            + " is null, likely because the file owning this block was"
            + " deleted and the block removal is delayed");
      }
      INodeFile iFile = ((INode)blockCollection).asFile();
      if (isFileDeleted(iFile)) {
        throw new FileNotFoundException("File not found: "
            + iFile.getFullPathName() + ", likely due to delayed block"
            + " removal");
      }
      if ((!iFile.isUnderConstruction() || storedBlock.isComplete()) &&
          iFile.getLastBlock().isComplete()) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Unexpected block (=" + oldBlock
                    + ") since the file (=" + iFile.getLocalName()
                    + ") is not under construction");
        }
        return;
      }

      BlockInfoContiguousUnderConstruction truncatedBlock =
          (BlockInfoContiguousUnderConstruction) iFile.getLastBlock();
      long recoveryId = truncatedBlock.getBlockRecoveryId();
      // copy-truncate recovery: the last block differs from the stored block
      boolean copyTruncate =
          truncatedBlock.getBlockId() != storedBlock.getBlockId();
      if(recoveryId != newgenerationstamp) {
        throw new IOException("The recovery id " + newgenerationstamp
                              + " does not match current recovery id "
                              + recoveryId + " for block " + oldBlock);
      }

      if (deleteblock) {
        Block blockToDel = ExtendedBlock.getLocalBlock(oldBlock);
        boolean remove = iFile.removeLastBlock(blockToDel);
        if (remove) {
          blockManager.removeBlock(storedBlock);
        }
      }
      else {
        // update last block
        if(!copyTruncate) {
          storedBlock.setGenerationStamp(newgenerationstamp);
          storedBlock.setNumBytes(newlength);
        }

        // find the DatanodeDescriptor objects
        ArrayList<DatanodeDescriptor> trimmedTargets =
            new ArrayList<DatanodeDescriptor>(newtargets.length);
        ArrayList<String> trimmedStorages =
            new ArrayList<String>(newtargets.length);
        if (newtargets.length > 0) {
          for (int i = 0; i < newtargets.length; ++i) {
            // try to get targetNode
            DatanodeDescriptor targetNode =
                blockManager.getDatanodeManager().getDatanode(newtargets[i]);
            if (targetNode != null) {
              trimmedTargets.add(targetNode);
              trimmedStorages.add(newtargetstorages[i]);
            } else if (LOG.isDebugEnabled()) {
              LOG.debug("DatanodeDescriptor (=" + newtargets[i] + ") not found");
            }
          }
        }
        if ((closeFile) && !trimmedTargets.isEmpty()) {
          // the file is getting closed. Insert block locations into blockManager.
          // Otherwise fsck will report these blocks as MISSING, especially if the
          // blocksReceived from Datanodes take a long time to arrive.
          for (int i = 0; i < trimmedTargets.size(); i++) {
            DatanodeStorageInfo storageInfo =
                trimmedTargets.get(i).getStorageInfo(trimmedStorages.get(i));
            if (storageInfo != null) {
              if(copyTruncate) {
                storageInfo.addBlock(truncatedBlock);
              } else {
                storageInfo.addBlock(storedBlock);
              }
            }
          }
        }

        // add pipeline locations into the INodeUnderConstruction
        DatanodeStorageInfo[] trimmedStorageInfos =
            blockManager.getDatanodeManager().getDatanodeStorageInfos(
                trimmedTargets.toArray(new DatanodeID[trimmedTargets.size()]),
                trimmedStorages.toArray(new String[trimmedStorages.size()]));
        if(copyTruncate) {
          iFile.setLastBlock(truncatedBlock, trimmedStorageInfos);
        } else {
          iFile.setLastBlock(storedBlock, trimmedStorageInfos);
          if (closeFile) {
            // Replicas still carrying the old stamp/length are now corrupt.
            blockManager.markBlockReplicasAsCorrupt(storedBlock,
                oldGenerationStamp, oldNumBytes, trimmedStorageInfos);
          }
        }
      }

      if (closeFile) {
        if(copyTruncate) {
          src = closeFileCommitBlocks(iFile, truncatedBlock);
          if(!iFile.isBlockInLatestSnapshot(storedBlock)) {
            blockManager.removeBlock(storedBlock);
          }
        } else {
          src = closeFileCommitBlocks(iFile, storedBlock);
        }
      } else {
        // If this commit does not want to close the file, persist blocks
        src = iFile.getFullPathName();
        persistBlocks(src, iFile, false);
      }
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the lock.
    getEditLog().logSync();
    if (closeFile) {
      LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock
          + ", file=" + src
          + ", newgenerationstamp=" + newgenerationstamp
          + ", newlength=" + newlength
          + ", newtargets=" + Arrays.asList(newtargets) + ") successful");
    } else {
      LOG.info("commitBlockSynchronization(" + oldBlock + ") successful");
    }
  }
4428
4429  /**
4430   * @param pendingFile open file that needs to be closed
4431   * @param storedBlock last block
4432   * @return Path of the file that was closed.
4433   * @throws IOException on error
4434   */
4435  @VisibleForTesting
4436  String closeFileCommitBlocks(INodeFile pendingFile, BlockInfoContiguous storedBlock)
4437      throws IOException {
4438    final INodesInPath iip = INodesInPath.fromINode(pendingFile);
4439    final String src = iip.getPath();
4440
4441    // commit the last block and complete it if it has minimum replicas
4442    commitOrCompleteLastBlock(pendingFile, iip, storedBlock);
4443
4444    //remove lease, close file
4445    finalizeINodeFileUnderConstruction(src, pendingFile,
4446        Snapshot.findLatestSnapshot(pendingFile, Snapshot.CURRENT_STATE_ID));
4447
4448    return src;
4449  }
4450
4451  /**
4452   * Renew the lease(s) held by the given client
4453   */
4454  void renewLease(String holder) throws IOException {
4455    checkOperation(OperationCategory.WRITE);
4456    readLock();
4457    try {
4458      checkOperation(OperationCategory.WRITE);
4459      checkNameNodeSafeMode("Cannot renew lease for " + holder);
4460      leaseManager.renewLease(holder);
4461    } finally {
4462      readUnlock();
4463    }
4464  }
4465
4466  /**
4467   * Get a partial listing of the indicated directory
4468   *
4469   * @param src the directory name
4470   * @param startAfter the name to start after
4471   * @param needLocation if blockLocations need to be returned
4472   * @return a partial listing starting after startAfter
4473   * 
4474   * @throws AccessControlException if access is denied
4475   * @throws UnresolvedLinkException if symbolic link is encountered
4476   * @throws IOException if other I/O error occurred
4477   */
4478  DirectoryListing getListing(String src, byte[] startAfter,
4479      boolean needLocation) 
4480      throws IOException {
4481    checkOperation(OperationCategory.READ);
4482    DirectoryListing dl = null;
4483    readLock();
4484    try {
4485      checkOperation(NameNode.OperationCategory.READ);
4486      dl = FSDirStatAndListingOp.getListingInt(dir, src, startAfter,
4487          needLocation);
4488    } catch (AccessControlException e) {
4489      logAuditEvent(false, "listStatus", src);
4490      throw e;
4491    } finally {
4492      readUnlock();
4493    }
4494    logAuditEvent(true, "listStatus", src);
4495    return dl;
4496  }
4497
4498  /////////////////////////////////////////////////////////
4499  //
4500  // These methods are called by datanodes
4501  //
4502  /////////////////////////////////////////////////////////
4503  /**
4504   * Register Datanode.
4505   * <p>
4506   * The purpose of registration is to identify whether the new datanode
4507   * serves a new data storage, and will report new data block copies,
4508   * which the namenode was not aware of; or the datanode is a replacement
4509   * node for the data storage that was previously served by a different
4510   * or the same (in terms of host:port) datanode.
4511   * The data storages are distinguished by their storageIDs. When a new
4512   * data storage is reported the namenode issues a new unique storageID.
4513   * <p>
4514   * Finally, the namenode returns its namespaceID as the registrationID
4515   * for the datanodes. 
4516   * namespaceID is a persistent attribute of the name space.
4517   * The registrationID is checked every time the datanode is communicating
4518   * with the namenode. 
4519   * Datanodes with inappropriate registrationID are rejected.
4520   * If the namenode stops, and then restarts it can restore its 
4521   * namespaceID and will continue serving the datanodes that has previously
4522   * registered with the namenode without restarting the whole cluster.
4523   * 
4524   * @see org.apache.hadoop.hdfs.server.datanode.DataNode
4525   */
4526  void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
4527    writeLock();
4528    try {
4529      getBlockManager().getDatanodeManager().registerDatanode(nodeReg);
4530      checkSafeMode();
4531    } finally {
4532      writeUnlock();
4533    }
4534  }
4535  
4536  /**
4537   * Get registrationID for datanodes based on the namespaceID.
4538   * 
4539   * @see #registerDatanode(DatanodeRegistration)
4540   * @return registration ID
4541   */
4542  String getRegistrationID() {
4543    return Storage.getRegistrationID(getFSImage().getStorage());
4544  }
4545
4546  /**
4547   * The given node has reported in.  This method should:
4548   * 1) Record the heartbeat, so the datanode isn't timed out
4549   * 2) Adjust usage stats for future block allocation
4550   * 
4551   * If a substantial amount of time passed since the last datanode 
4552   * heartbeat then request an immediate block report.  
4553   * 
4554   * @return an array of datanode commands 
4555   * @throws IOException
4556   */
4557  HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
4558      StorageReport[] reports, long cacheCapacity, long cacheUsed,
4559      int xceiverCount, int xmitsInProgress, int failedVolumes,
4560      VolumeFailureSummary volumeFailureSummary) throws IOException {
4561    readLock();
4562    try {
4563      //get datanode commands
4564      final int maxTransfer = blockManager.getMaxReplicationStreams()
4565          - xmitsInProgress;
4566      DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
4567          nodeReg, reports, blockPoolId, cacheCapacity, cacheUsed,
4568          xceiverCount, maxTransfer, failedVolumes, volumeFailureSummary);
4569      
4570      //create ha status
4571      final NNHAStatusHeartbeat haState = new NNHAStatusHeartbeat(
4572          haContext.getState().getServiceState(),
4573          getFSImage().getLastAppliedOrWrittenTxId());
4574
4575      return new HeartbeatResponse(cmds, haState, rollingUpgradeInfo);
4576    } finally {
4577      readUnlock();
4578    }
4579  }
4580
4581  /**
4582   * Returns whether or not there were available resources at the last check of
4583   * resources.
4584   *
4585   * @return true if there were sufficient resources available, false otherwise.
4586   */
4587  boolean nameNodeHasResourcesAvailable() {
4588    return hasResourcesAvailable;
4589  }
4590
4591  /**
4592   * Perform resource checks and cache the results.
4593   */
4594  void checkAvailableResources() {
4595    Preconditions.checkState(nnResourceChecker != null,
4596        "nnResourceChecker not initialized");
4597    hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
4598  }
4599
4600  /**
4601   * Persist the block list for the inode.
4602   * @param path
4603   * @param file
4604   * @param logRetryCache
4605   */
4606  private void persistBlocks(String path, INodeFile file,
4607                             boolean logRetryCache) {
4608    assert hasWriteLock();
4609    Preconditions.checkArgument(file.isUnderConstruction());
4610    getEditLog().logUpdateBlocks(path, file, logRetryCache);
4611    NameNode.stateChangeLog.debug("persistBlocks: {} with {} blocks is" +
4612        " peristed to the file system", path, file.getBlocks().length);
4613  }
4614
4615  /**
4616   * Close file.
4617   * @param path
4618   * @param file
4619   */
4620  private void closeFile(String path, INodeFile file) {
4621    assert hasWriteLock();
4622    waitForLoadingFSImage();
4623    // file is closed
4624    getEditLog().logCloseFile(path, file);
4625    NameNode.stateChangeLog.debug("closeFile: {} with {} blocks is persisted" +
4626        " to the file system", path, file.getBlocks().length);
4627  }
4628
4629  /**
4630   * Periodically calls hasAvailableResources of NameNodeResourceChecker, and if
4631   * there are found to be insufficient resources available, causes the NN to
4632   * enter safe mode. If resources are later found to have returned to
4633   * acceptable levels, this daemon will cause the NN to exit safe mode.
4634   */
4635  class NameNodeResourceMonitor implements Runnable  {
4636    boolean shouldNNRmRun = true;
4637    @Override
4638    public void run () {
4639      try {
4640        while (fsRunning && shouldNNRmRun) {
4641          checkAvailableResources();
4642          if(!nameNodeHasResourcesAvailable()) {
4643            String lowResourcesMsg = "NameNode low on available disk space. ";
4644            if (!isInSafeMode()) {
4645              LOG.warn(lowResourcesMsg + "Entering safe mode.");
4646            } else {
4647              LOG.warn(lowResourcesMsg + "Already in safe mode.");
4648            }
4649            enterSafeMode(true);
4650          }
4651          try {
4652            Thread.sleep(resourceRecheckInterval);
4653          } catch (InterruptedException ie) {
4654            // Deliberately ignore
4655          }
4656        }
4657      } catch (Exception e) {
4658        FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
4659      }
4660    }
4661
4662    public void stopMonitor() {
4663      shouldNNRmRun = false;
4664    }
4665 }
4666
4667  class NameNodeEditLogRoller implements Runnable {
4668
4669    private boolean shouldRun = true;
4670    private final long rollThreshold;
4671    private final long sleepIntervalMs;
4672
4673    public NameNodeEditLogRoller(long rollThreshold, int sleepIntervalMs) {
4674        this.rollThreshold = rollThreshold;
4675        this.sleepIntervalMs = sleepIntervalMs;
4676    }
4677
4678    @Override
4679    public void run() {
4680      while (fsRunning && shouldRun) {
4681        try {
4682          FSEditLog editLog = getFSImage().getEditLog();
4683          long numEdits =
4684              editLog.getLastWrittenTxId() - editLog.getCurSegmentTxId();
4685          if (numEdits > rollThreshold) {
4686            FSNamesystem.LOG.info("NameNode rolling its own edit log because"
4687                + " number of edits in open segment exceeds threshold of "
4688                + rollThreshold);
4689            rollEditLog();
4690          }
4691        } catch (Exception e) {
4692          FSNamesystem.LOG.error("Swallowing exception in "
4693              + NameNodeEditLogRoller.class.getSimpleName() + ":", e);
4694        }
4695        try {
4696          Thread.sleep(sleepIntervalMs);
4697        } catch (InterruptedException e) {
4698          FSNamesystem.LOG.info(NameNodeEditLogRoller.class.getSimpleName()
4699              + " was interrupted, exiting");
4700          break;
4701        }
4702      }
4703    }
4704
4705    public void stop() {
4706      shouldRun = false;
4707    }
4708  }
4709
4710  /**
4711   * Daemon to periodically scan the namespace for lazyPersist files
4712   * with missing blocks and unlink them.
4713   */
4714  class LazyPersistFileScrubber implements Runnable {
4715    private volatile boolean shouldRun = true;
4716    final int scrubIntervalSec;
4717    public LazyPersistFileScrubber(final int scrubIntervalSec) {
4718      this.scrubIntervalSec = scrubIntervalSec;
4719    }
4720
4721    /**
4722     * Periodically go over the list of lazyPersist files with missing
4723     * blocks and unlink them from the namespace.
4724     */
4725    private void clearCorruptLazyPersistFiles()
4726        throws IOException {
4727
4728      BlockStoragePolicy lpPolicy = blockManager.getStoragePolicy("LAZY_PERSIST");
4729
4730      List<BlockCollection> filesToDelete = new ArrayList<>();
4731      boolean changed = false;
4732      writeLock();
4733      try {
4734        final Iterator<Block> it = blockManager.getCorruptReplicaBlockIterator();
4735
4736        while (it.hasNext()) {
4737          Block b = it.next();
4738          BlockInfoContiguous blockInfo = blockManager.getStoredBlock(b);
4739          if (blockInfo.getBlockCollection().getStoragePolicyID()
4740              == lpPolicy.getId()) {
4741            filesToDelete.add(blockInfo.getBlockCollection());
4742          }
4743        }
4744
4745        for (BlockCollection bc : filesToDelete) {
4746          LOG.warn("Removing lazyPersist file " + bc.getName() + " with no replicas.");
4747          BlocksMapUpdateInfo toRemoveBlocks =
4748              FSDirDeleteOp.deleteInternal(
4749                  FSNamesystem.this, bc.getName(),
4750                  INodesInPath.fromINode((INodeFile) bc), false);
4751          changed |= toRemoveBlocks != null;
4752          if (toRemoveBlocks != null) {
4753            removeBlocks(toRemoveBlocks); // Incremental deletion of blocks
4754          }
4755        }
4756      } finally {
4757        writeUnlock();
4758      }
4759      if (changed) {
4760        getEditLog().logSync();
4761      }
4762    }
4763
4764    @Override
4765    public void run() {
4766      while (fsRunning && shouldRun) {
4767        try {
4768          clearCorruptLazyPersistFiles();
4769          Thread.sleep(scrubIntervalSec * 1000);
4770        } catch (InterruptedException e) {
4771          FSNamesystem.LOG.info(
4772              "LazyPersistFileScrubber was interrupted, exiting");
4773          break;
4774        } catch (Exception e) {
4775          FSNamesystem.LOG.error(
4776              "Ignoring exception in LazyPersistFileScrubber:", e);
4777        }
4778      }
4779    }
4780
4781    public void stop() {
4782      shouldRun = false;
4783    }
4784  }
4785
  /** @return the FSImage instance backing this namesystem. */
  public FSImage getFSImage() {
    return fsImage;
  }
4789
  /** @return the edit log of the underlying FSImage. */
  public FSEditLog getEditLog() {
    return getFSImage().getEditLog();
  }    
4793
4794  private void checkBlock(ExtendedBlock block) throws IOException {
4795    if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) {
4796      throw new IOException("Unexpected BlockPoolId " + block.getBlockPoolId()
4797          + " - expected " + blockPoolId);
4798    }
4799  }
4800
  /** @return number of missing blocks, as tracked by the block manager. */
  @Metric({"MissingBlocks", "Number of missing blocks"})
  public long getMissingBlocksCount() {
    // not locking
    return blockManager.getMissingBlocksCount();
  }
4806
  /** @return number of missing blocks whose replication factor is 1. */
  @Metric({"MissingReplOneBlocks", "Number of missing blocks " +
      "with replication factor 1"})
  public long getMissingReplOneBlocksCount() {
    // not locking
    return blockManager.getMissingReplOneBlocksCount();
  }
4813  
  /** @return number of expired datanode heartbeats. */
  @Metric({"ExpiredHeartbeats", "Number of expired heartbeats"})
  public int getExpiredHeartbeats() {
    return datanodeStatistics.getExpiredHeartbeats();
  }
4818  
  /**
   * @return number of transactions written since the most recent
   *         checkpointed transaction id.
   */
  @Metric({"TransactionsSinceLastCheckpoint",
      "Number of transactions since last checkpoint"})
  public long getTransactionsSinceLastCheckpoint() {
    return getEditLog().getLastWrittenTxId() -
        getFSImage().getStorage().getMostRecentCheckpointTxId();
  }
4825  
4826  @Metric({"TransactionsSinceLastLogRoll",
4827      "Number of transactions since last edit log roll"})
4828  public long getTransactionsSinceLastLogRoll() {
4829    if (isInStandbyState() || !getEditLog().isSegmentOpen()) {
4830      return 0;
4831    } else {
4832      return getEditLog().getLastWrittenTxId() -
4833        getEditLog().getCurSegmentTxId() + 1;
4834    }
4835  }
4836  
  /** @return the last transaction id written to the edit log. */
  @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
  public long getLastWrittenTransactionId() {
    return getEditLog().getLastWrittenTxId();
  }
4841  
  /** @return epoch millis of the most recent checkpoint. */
  @Metric({"LastCheckpointTime",
      "Time in milliseconds since the epoch of the last checkpoint"})
  public long getLastCheckpointTime() {
    return getFSImage().getStorage().getMostRecentCheckpointTime();
  }
4847
4848  /** @see ClientProtocol#getStats() */
4849  long[] getStats() {
4850    final long[] stats = datanodeStatistics.getStats();
4851    stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks();
4852    stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks();
4853    stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount();
4854    stats[ClientProtocol.GET_STATS_MISSING_REPL_ONE_BLOCKS_IDX] =
4855        getMissingReplOneBlocksCount();
4856    return stats;
4857  }
4858
  /** @return total raw capacity of datanodes, in bytes. */
  @Override // FSNamesystemMBean
  @Metric({"CapacityTotal",
      "Total raw capacity of data nodes in bytes"})
  public long getCapacityTotal() {
    return datanodeStatistics.getCapacityTotal();
  }
4865
  /** @return total raw capacity of datanodes, rounded to GB. */
  @Metric({"CapacityTotalGB",
      "Total raw capacity of data nodes in GB"})
  public float getCapacityTotalGB() {
    return DFSUtil.roundBytesToGB(getCapacityTotal());
  }
4871
  /** @return used capacity across all datanodes, in bytes. */
  @Override // FSNamesystemMBean
  @Metric({"CapacityUsed",
      "Total used capacity across all data nodes in bytes"})
  public long getCapacityUsed() {
    return datanodeStatistics.getCapacityUsed();
  }
4878
  /** @return used capacity across all datanodes, rounded to GB. */
  @Metric({"CapacityUsedGB",
      "Total used capacity across all data nodes in GB"})
  public float getCapacityUsedGB() {
    return DFSUtil.roundBytesToGB(getCapacityUsed());
  }
4884
  /** @return remaining capacity across all datanodes, in bytes. */
  @Override // FSNamesystemMBean
  @Metric({"CapacityRemaining", "Remaining capacity in bytes"})
  public long getCapacityRemaining() {
    return datanodeStatistics.getCapacityRemaining();
  }
4890
  /** @return remaining capacity across all datanodes, rounded to GB. */
  @Metric({"CapacityRemainingGB", "Remaining capacity in GB"})
  public float getCapacityRemainingGB() {
    return DFSUtil.roundBytesToGB(getCapacityRemaining());
  }
4895
  /** @return space used by datanodes for non-DFS purposes, in bytes. */
  @Metric({"CapacityUsedNonDFS",
      "Total space used by data nodes for non DFS purposes in bytes"})
  public long getCapacityUsedNonDFS() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }
4901
4902  /**
4903   * Total number of connections.
4904   */
4905  @Override // FSNamesystemMBean
4906  @Metric
4907  public int getTotalLoad() {
4908    return datanodeStatistics.getXceiverCount();
4909  }
4910  
  /** @return number of snapshottable directories in the namespace. */
  @Metric({ "SnapshottableDirectories", "Number of snapshottable directories" })
  public int getNumSnapshottableDirs() {
    return this.snapshotManager.getNumSnapshottableDirs();
  }
4915
  /** @return total number of snapshots in the namespace. */
  @Metric({ "Snapshots", "The number of snapshots" })
  public int getNumSnapshots() {
    return this.snapshotManager.getNumSnapshots();
  }
4920
4921  @Override
4922  public String getSnapshotStats() {
4923    Map<String, Object> info = new HashMap<String, Object>();
4924    info.put("SnapshottableDirectories", this.getNumSnapshottableDirs());
4925    info.put("Snapshots", this.getNumSnapshots());
4926    return JSON.toString(info);
4927  }
4928
4929  int getNumberOfDatanodes(DatanodeReportType type) {
4930    readLock();
4931    try {
4932      return getBlockManager().getDatanodeManager().getDatanodeListForReport(
4933          type).size(); 
4934    } finally {
4935      readUnlock();
4936    }
4937  }
4938
4939  DatanodeInfo[] datanodeReport(final DatanodeReportType type
4940      ) throws AccessControlException, StandbyException {
4941    checkSuperuserPrivilege();
4942    checkOperation(OperationCategory.UNCHECKED);
4943    readLock();
4944    try {
4945      checkOperation(OperationCategory.UNCHECKED);
4946      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
4947      final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type);
4948
4949      DatanodeInfo[] arr = new DatanodeInfo[results.size()];
4950      for (int i=0; i<arr.length; i++) {
4951        arr[i] = new DatanodeInfo(results.get(i));
4952      }
4953      return arr;
4954    } finally {
4955      readUnlock();
4956    }
4957  }
4958
4959  DatanodeStorageReport[] getDatanodeStorageReport(final DatanodeReportType type
4960      ) throws AccessControlException, StandbyException {
4961    checkSuperuserPrivilege();
4962    checkOperation(OperationCategory.UNCHECKED);
4963    readLock();
4964    try {
4965      checkOperation(OperationCategory.UNCHECKED);
4966      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
4967      final List<DatanodeDescriptor> datanodes = dm.getDatanodeListForReport(type);
4968
4969      DatanodeStorageReport[] reports = new DatanodeStorageReport[datanodes.size()];
4970      for (int i = 0; i < reports.length; i++) {
4971        final DatanodeDescriptor d = datanodes.get(i);
4972        reports[i] = new DatanodeStorageReport(new DatanodeInfo(d),
4973            d.getStorageReports());
4974      }
4975      return reports;
4976    } finally {
4977      readUnlock();
4978    }
4979  }
4980
4981  /**
4982   * Save namespace image.
4983   * This will save current namespace into fsimage file and empty edits file.
4984   * Requires superuser privilege and safe mode.
4985   * 
4986   * @throws AccessControlException if superuser privilege is violated.
4987   * @throws IOException if 
4988   */
4989  void saveNamespace() throws AccessControlException, IOException {
4990    checkOperation(OperationCategory.UNCHECKED);
4991    checkSuperuserPrivilege();
4992
4993    cpLock();  // Block if a checkpointing is in progress on standby.
4994    readLock();
4995    try {
4996      checkOperation(OperationCategory.UNCHECKED);
4997
4998      if (!isInSafeMode()) {
4999        throw new IOException("Safe mode should be turned ON "
5000            + "in order to create namespace image.");
5001      }
5002      getFSImage().saveNamespace(this);
5003    } finally {
5004      readUnlock();
5005      cpUnlock();
5006    }
5007    LOG.info("New namespace image has been created");
5008  }
5009  
5010  /**
5011   * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
5012   * Requires superuser privilege.
5013   * 
5014   * @throws AccessControlException if superuser privilege is violated.
5015   */
5016  boolean restoreFailedStorage(String arg) throws AccessControlException,
5017      StandbyException {
5018    checkSuperuserPrivilege();
5019    checkOperation(OperationCategory.UNCHECKED);
5020    cpLock();  // Block if a checkpointing is in progress on standby.
5021    writeLock();
5022    try {
5023      checkOperation(OperationCategory.UNCHECKED);
5024      
5025      // if it is disabled - enable it and vice versa.
5026      if(arg.equals("check"))
5027        return getFSImage().getStorage().getRestoreFailedStorage();
5028      
5029      boolean val = arg.equals("true");  // false if not
5030      getFSImage().getStorage().setRestoreFailedStorage(val);
5031      
5032      return val;
5033    } finally {
5034      writeUnlock();
5035      cpUnlock();
5036    }
5037  }
5038
  /** @return a new Date holding this namesystem's start time. */
  Date getStartTime() {
    return new Date(startTime); 
  }
5042    
  /**
   * Finalize a cluster upgrade. Requires superuser privilege; blocks while
   * a standby checkpoint is in progress, then finalizes under the write lock.
   */
  void finalizeUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.UNCHECKED);
    cpLock();  // Block if a checkpointing is in progress on standby.
    writeLock();
    try {
      checkOperation(OperationCategory.UNCHECKED);
      getFSImage().finalizeUpgrade(this.isHaEnabled() && inActiveState());
    } finally {
      writeUnlock();
      cpUnlock();
    }
  }
5056
  /**
   * Ask the datanode manager to refresh its set of nodes using a freshly
   * loaded HdfsConfiguration. Requires superuser privilege.
   */
  void refreshNodes() throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().refreshNodes(new HdfsConfiguration());
  }
5062
  /**
   * Set the balancer bandwidth via the datanode manager.
   * Requires superuser privilege.
   *
   * @param bandwidth new balancer bandwidth value
   */
  void setBalancerBandwidth(long bandwidth) throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth);
  }
5068
5069  /**
5070   * Persist the new block (the last block of the given file).
5071   * @param path
5072   * @param file
5073   */
5074  private void persistNewBlock(String path, INodeFile file) {
5075    Preconditions.checkArgument(file.isUnderConstruction());
5076    getEditLog().logAddBlock(path, file);
5077    NameNode.stateChangeLog.debug("persistNewBlock: {} with new block {}," +
5078        " current total block count is {}", path,
5079        file.getLastBlock().toString(), file.getBlocks().length);
5080  }
5081
5082  /**
5083   * SafeModeInfo contains information related to the safe mode.
5084   * <p>
5085   * An instance of {@link SafeModeInfo} is created when the name node
5086   * enters safe mode.
5087   * <p>
5088   * During name node startup {@link SafeModeInfo} counts the number of
5089   * <em>safe blocks</em>, those that have at least the minimal number of
5090   * replicas, and calculates the ratio of safe blocks to the total number
5091   * of blocks in the system, which is the size of blocks in
5092   * {@link FSNamesystem#blockManager}. When the ratio reaches the
5093   * {@link #threshold} it starts the SafeModeMonitor daemon in order
5094   * to monitor whether the safe mode {@link #extension} is passed.
5095   * Then it leaves safe mode and destroys itself.
5096   * <p>
   * If safe mode is turned on manually then the number of safe blocks is
   * not tracked because the name node is not intended to leave safe mode
   * automatically in that case.
5100   *
5101   * @see ClientProtocol#setSafeMode(HdfsConstants.SafeModeAction, boolean)
5102   */
5103  public class SafeModeInfo {
5104    // configuration fields
5105    /** Safe mode threshold condition %.*/
5106    private final double threshold;
5107    /** Safe mode minimum number of datanodes alive */
5108    private final int datanodeThreshold;
5109    /**
5110     * Safe mode extension after the threshold.
5111     * Make it volatile so that getSafeModeTip can read the latest value
5112     * without taking a lock.
5113     */
5114    private volatile int extension;
5115    /** Min replication required by safe mode. */
5116    private final int safeReplication;
5117    /** threshold for populating needed replication queues */
5118    private final double replQueueThreshold;
5119    // internal fields
5120    /** Time when threshold was reached.
5121     * <br> -1 safe mode is off
5122     * <br> 0 safe mode is on, and threshold is not reached yet
5123     * <br> >0 safe mode is on, but we are in extension period 
5124     */
5125    private long reached = -1;  
5126    private long reachedTimestamp = -1;
5127    /** Total number of blocks. */
5128    int blockTotal; 
5129    /** Number of safe blocks. */
5130    int blockSafe;
5131    /** Number of blocks needed to satisfy safe mode threshold condition */
5132    private int blockThreshold;
5133    /** Number of blocks needed before populating replication queues */
5134    private int blockReplQueueThreshold;
5135    /** time of the last status printout */
5136    private long lastStatusReport = 0;
5137    /**
5138     * Was safemode entered automatically because available resources were low.
5139     * Make it volatile so that getSafeModeTip can read the latest value
5140     * without taking a lock.
5141     */
5142    private volatile boolean resourcesLow = false;
5143    /** Should safemode adjust its block totals as blocks come in */
5144    private boolean shouldIncrementallyTrackBlocks = false;
5145    /** counter for tracking startup progress of reported blocks */
5146    private Counter awaitingReportedBlocksCounter;
5147    
5148    /**
5149     * Creates SafeModeInfo when the name node enters
5150     * automatic safe mode at startup.
5151     *  
5152     * @param conf configuration
5153     */
5154    private SafeModeInfo(Configuration conf) {
5155      this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY,
5156          DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT);
5157      if(threshold > 1.0) {
5158        LOG.warn("The threshold value should't be greater than 1, threshold: " + threshold);
5159      }
5160      this.datanodeThreshold = conf.getInt(
5161        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
5162        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
5163      this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
5164      this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY, 
5165                                         DFS_NAMENODE_REPLICATION_MIN_DEFAULT);
5166      
5167      LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold);
5168      LOG.info(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold);
5169      LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + "     = " + extension);
5170
5171      // default to safe mode threshold (i.e., don't populate queues before leaving safe mode)
5172      this.replQueueThreshold = 
5173        conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY,
5174                      (float) threshold);
5175      this.blockTotal = 0; 
5176      this.blockSafe = 0;
5177    }
5178
5179    /**
5180     * In the HA case, the StandbyNode can be in safemode while the namespace
5181     * is modified by the edit log tailer. In this case, the number of total
5182     * blocks changes as edits are processed (eg blocks are added and deleted).
5183     * However, we don't want to do the incremental tracking during the
5184     * startup-time loading process -- only once the initial total has been
5185     * set after the image has been loaded.
5186     */
5187    private boolean shouldIncrementallyTrackBlocks() {
5188      return shouldIncrementallyTrackBlocks;
5189    }
5190
5191    /**
5192     * Creates SafeModeInfo when safe mode is entered manually, or because
5193     * available resources are low.
5194     *
5195     * The {@link #threshold} is set to 1.5 so that it could never be reached.
5196     * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
5197     * 
5198     * @see SafeModeInfo
5199     */
5200    private SafeModeInfo(boolean resourcesLow) {
5201      this.threshold = 1.5f;  // this threshold can never be reached
5202      this.datanodeThreshold = Integer.MAX_VALUE;
5203      this.extension = Integer.MAX_VALUE;
5204      this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
5205      this.replQueueThreshold = 1.5f; // can never be reached
5206      this.blockTotal = -1;
5207      this.blockSafe = -1;
5208      this.resourcesLow = resourcesLow;
5209      enter();
5210      reportStatus("STATE* Safe mode is ON.", true);
5211    }
5212      
5213    /**
5214     * Check if safe mode is on.
5215     * @return true if in safe mode
5216     */
5217    private synchronized boolean isOn() {
5218      doConsistencyCheck();
5219      return this.reached >= 0;
5220    }
5221      
5222    /**
5223     * Enter safe mode.
5224     */
5225    private void enter() {
5226      this.reached = 0;
5227      this.reachedTimestamp = 0;
5228    }
5229      
5230    /**
5231     * Leave safe mode.
5232     * <p>
5233     * Check for invalid, under- & over-replicated blocks in the end of startup.
5234     */
5235    private synchronized void leave() {
5236      // if not done yet, initialize replication queues.
5237      // In the standby, do not populate repl queues
5238      if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) {
5239        initializeReplQueues();
5240      }
5241      long timeInSafemode = now() - startTime;
5242      NameNode.stateChangeLog.info("STATE* Leaving safe mode after " 
5243                                    + timeInSafemode/1000 + " secs");
5244      NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);
5245
5246      //Log the following only once (when transitioning from ON -> OFF)
5247      if (reached >= 0) {
5248        NameNode.stateChangeLog.info("STATE* Safe mode is OFF"); 
5249      }
5250      reached = -1;
5251      reachedTimestamp = -1;
5252      safeMode = null;
5253      final NetworkTopology nt = blockManager.getDatanodeManager().getNetworkTopology();
5254      NameNode.stateChangeLog.info("STATE* Network topology has "
5255          + nt.getNumOfRacks() + " racks and "
5256          + nt.getNumOfLeaves() + " datanodes");
5257      NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
5258          + blockManager.numOfUnderReplicatedBlocks() + " blocks");
5259
5260      startSecretManagerIfNecessary();
5261
5262      // If startup has not yet completed, end safemode phase.
5263      StartupProgress prog = NameNode.getStartupProgress();
5264      if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
5265        prog.endStep(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS);
5266        prog.endPhase(Phase.SAFEMODE);
5267      }
5268    }
5269
5270    /**
5271     * Check whether we have reached the threshold for 
5272     * initializing replication queues.
5273     */
5274    private synchronized boolean canInitializeReplQueues() {
5275      return shouldPopulateReplQueues()
5276          && blockSafe >= blockReplQueueThreshold;
5277    }
5278      
5279    /** 
5280     * Safe mode can be turned off iff 
5281     * the threshold is reached and 
5282     * the extension time have passed.
5283     * @return true if can leave or false otherwise.
5284     */
5285    private synchronized boolean canLeave() {
5286      if (reached == 0) {
5287        return false;
5288      }
5289
5290      if (monotonicNow() - reached < extension) {
5291        reportStatus("STATE* Safe mode ON, in safe mode extension.", false);
5292        return false;
5293      }
5294
5295      if (needEnter()) {
5296        reportStatus("STATE* Safe mode ON, thresholds not met.", false);
5297        return false;
5298      }
5299
5300      return true;
5301    }
5302      
5303    /** 
5304     * There is no need to enter safe mode 
5305     * if DFS is empty or {@link #threshold} == 0
5306     */
5307    private boolean needEnter() {
5308      return (threshold != 0 && blockSafe < blockThreshold) ||
5309        (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
5310        (!nameNodeHasResourcesAvailable());
5311    }
5312      
5313    /**
5314     * Check and trigger safe mode if needed. 
5315     */
5316    private void checkMode() {
5317      // Have to have write-lock since leaving safemode initializes
5318      // repl queues, which requires write lock
5319      assert hasWriteLock();
5320      if (inTransitionToActive()) {
5321        return;
5322      }
5323      // if smmthread is already running, the block threshold must have been 
5324      // reached before, there is no need to enter the safe mode again
5325      if (smmthread == null && needEnter()) {
5326        enter();
5327        // check if we are ready to initialize replication queues
5328        if (canInitializeReplQueues() && !isPopulatingReplQueues()
5329            && !haEnabled) {
5330          initializeReplQueues();
5331        }
5332        reportStatus("STATE* Safe mode ON.", false);
5333        return;
5334      }
5335      // the threshold is reached or was reached before
5336      if (!isOn() ||                           // safe mode is off
5337          extension <= 0 || threshold <= 0) {  // don't need to wait
5338        this.leave(); // leave safe mode
5339        return;
5340      }
5341      if (reached > 0) {  // threshold has already been reached before
5342        reportStatus("STATE* Safe mode ON.", false);
5343        return;
5344      }
5345      // start monitor
5346      reached = monotonicNow();
5347      reachedTimestamp = now();
5348      if (smmthread == null) {
5349        smmthread = new Daemon(new SafeModeMonitor());
5350        smmthread.start();
5351        reportStatus("STATE* Safe mode extension entered.", true);
5352      }
5353
5354      // check if we are ready to initialize replication queues
5355      if (canInitializeReplQueues() && !isPopulatingReplQueues() && !haEnabled) {
5356        initializeReplQueues();
5357      }
5358    }
5359      
5360    /**
5361     * Set total number of blocks.
5362     */
5363    private synchronized void setBlockTotal(int total) {
5364      this.blockTotal = total;
5365      this.blockThreshold = (int) (blockTotal * threshold);
5366      this.blockReplQueueThreshold = 
5367        (int) (blockTotal * replQueueThreshold);
5368      if (haEnabled) {
5369        // After we initialize the block count, any further namespace
5370        // modifications done while in safe mode need to keep track
5371        // of the number of total blocks in the system.
5372        this.shouldIncrementallyTrackBlocks = true;
5373      }
5374      if(blockSafe < 0)
5375        this.blockSafe = 0;
5376      checkMode();
5377    }
5378      
5379    /**
5380     * Increment number of safe blocks if current block has 
5381     * reached minimal replication.
5382     * @param replication current replication 
5383     */
5384    private synchronized void incrementSafeBlockCount(short replication) {
5385      if (replication == safeReplication) {
5386        this.blockSafe++;
5387
5388        // Report startup progress only if we haven't completed startup yet.
5389        StartupProgress prog = NameNode.getStartupProgress();
5390        if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
5391          if (this.awaitingReportedBlocksCounter == null) {
5392            this.awaitingReportedBlocksCounter = prog.getCounter(Phase.SAFEMODE,
5393              STEP_AWAITING_REPORTED_BLOCKS);
5394          }
5395          this.awaitingReportedBlocksCounter.increment();
5396        }
5397
5398        checkMode();
5399      }
5400    }
5401      
5402    /**
5403     * Decrement number of safe blocks if current block has 
5404     * fallen below minimal replication.
5405     * @param replication current replication 
5406     */
5407    private synchronized void decrementSafeBlockCount(short replication) {
5408      if (replication == safeReplication-1) {
5409        this.blockSafe--;
5410        //blockSafe is set to -1 in manual / low resources safemode
5411        assert blockSafe >= 0 || isManual() || areResourcesLow();
5412        checkMode();
5413      }
5414    }
5415
5416    /**
5417     * Check if safe mode was entered manually
5418     */
5419    private boolean isManual() {
5420      return extension == Integer.MAX_VALUE;
5421    }
5422
5423    /**
5424     * Set manual safe mode.
5425     */
5426    private synchronized void setManual() {
5427      extension = Integer.MAX_VALUE;
5428    }
5429
5430    /**
5431     * Check if safe mode was entered due to resources being low.
5432     */
5433    private boolean areResourcesLow() {
5434      return resourcesLow;
5435    }
5436
5437    /**
5438     * Set that resources are low for this instance of safe mode.
5439     */
5440    private void setResourcesLow() {
5441      resourcesLow = true;
5442    }
5443
5444    /**
5445     * A tip on how safe mode is to be turned off: manually or automatically.
5446     */
5447    String getTurnOffTip() {
5448      if(!isOn()) {
5449        return "Safe mode is OFF.";
5450      }
5451
5452      //Manual OR low-resource safemode. (Admin intervention required)
5453      String adminMsg = "It was turned on manually. ";
5454      if (areResourcesLow()) {
5455        adminMsg = "Resources are low on NN. Please add or free up more "
5456          + "resources then turn off safe mode manually. NOTE:  If you turn off"
5457          + " safe mode before adding resources, "
5458          + "the NN will immediately return to safe mode. ";
5459      }
5460      if (isManual() || areResourcesLow()) {
5461        return adminMsg
5462          + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
5463      }
5464
5465      boolean thresholdsMet = true;
5466      int numLive = getNumLiveDataNodes();
5467      String msg = "";
5468      if (blockSafe < blockThreshold) {
5469        msg += String.format(
5470          "The reported blocks %d needs additional %d"
5471          + " blocks to reach the threshold %.4f of total blocks %d.%n",
5472          blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
5473        thresholdsMet = false;
5474      } else {
5475        msg += String.format("The reported blocks %d has reached the threshold"
5476            + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);
5477      }
5478      if (numLive < datanodeThreshold) {
5479        msg += String.format(
5480          "The number of live datanodes %d needs an additional %d live "
5481          + "datanodes to reach the minimum number %d.%n",
5482          numLive, (datanodeThreshold - numLive), datanodeThreshold);
5483        thresholdsMet = false;
5484      } else {
5485        msg += String.format("The number of live datanodes %d has reached "
5486            + "the minimum number %d. ",
5487            numLive, datanodeThreshold);
5488      }
5489      msg += (reached > 0) ? "In safe mode extension. " : "";
5490      msg += "Safe mode will be turned off automatically ";
5491
5492      if (!thresholdsMet) {
5493        msg += "once the thresholds have been reached.";
5494      } else if (reached + extension - monotonicNow() > 0) {
5495        msg += ("in " + (reached + extension - monotonicNow()) / 1000 + " seconds.");
5496      } else {
5497        msg += "soon.";
5498      }
5499
5500      return msg;
5501    }
5502
5503    /**
5504     * Print status every 20 seconds.
5505     */
5506    private void reportStatus(String msg, boolean rightNow) {
5507      long curTime = now();
5508      if(!rightNow && (curTime - lastStatusReport < 20 * 1000))
5509        return;
5510      NameNode.stateChangeLog.info(msg + " \n" + getTurnOffTip());
5511      lastStatusReport = curTime;
5512    }
5513
5514    @Override
5515    public String toString() {
5516      String resText = "Current safe blocks = " 
5517        + blockSafe 
5518        + ". Target blocks = " + blockThreshold + " for threshold = %" + threshold
5519        + ". Minimal replication = " + safeReplication + ".";
5520      if (reached > 0) 
5521        resText += " Threshold was reached " + new Date(reachedTimestamp) + ".";
5522      return resText;
5523    }
5524      
5525    /**
5526     * Checks consistency of the class state.
5527     * This is costly so only runs if asserts are enabled.
5528     */
5529    private void doConsistencyCheck() {
5530      boolean assertsOn = false;
5531      assert assertsOn = true; // set to true if asserts are on
5532      if (!assertsOn) return;
5533      
5534      if (blockTotal == -1 && blockSafe == -1) {
5535        return; // manual safe mode
5536      }
5537      int activeBlocks = blockManager.getActiveBlockCount();
5538      if ((blockTotal != activeBlocks) &&
5539          !(blockSafe >= 0 && blockSafe <= blockTotal)) {
5540        throw new AssertionError(
5541            " SafeMode: Inconsistent filesystem state: "
5542        + "SafeMode data: blockTotal=" + blockTotal
5543        + " blockSafe=" + blockSafe + "; "
5544        + "BlockManager data: active="  + activeBlocks);
5545      }
5546    }
5547
5548    private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
5549      if (!shouldIncrementallyTrackBlocks) {
5550        return;
5551      }
5552      assert haEnabled;
5553      
5554      if (LOG.isDebugEnabled()) {
5555        LOG.debug("Adjusting block totals from " +
5556            blockSafe + "/" + blockTotal + " to " +
5557            (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
5558      }
5559      assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
5560        blockSafe + " by " + deltaSafe + ": would be negative";
5561      assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
5562        blockTotal + " by " + deltaTotal + ": would be negative";
5563      
5564      blockSafe += deltaSafe;
5565      setBlockTotal(blockTotal + deltaTotal);
5566    }
5567  }
5568    
5569  /**
5570   * Periodically check whether it is time to leave safe mode.
5571   * This thread starts when the threshold level is reached.
5572   *
5573   */
5574  class SafeModeMonitor implements Runnable {
5575    /** interval in msec for checking safe mode: {@value} */
5576    private static final long recheckInterval = 1000;
5577      
5578    /**
5579     */
5580    @Override
5581    public void run() {
5582      while (fsRunning) {
5583        writeLock();
5584        try {
5585          if (safeMode == null) { // Not in safe mode.
5586            break;
5587          }
5588          if (safeMode.canLeave()) {
5589            // Leave safe mode.
5590            safeMode.leave();
5591            smmthread = null;
5592            break;
5593          }
5594        } finally {
5595          writeUnlock();
5596        }
5597
5598        try {
5599          Thread.sleep(recheckInterval);
5600        } catch (InterruptedException ie) {
5601          // Ignored
5602        }
5603      }
5604      if (!fsRunning) {
5605        LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
5606      }
5607    }
5608  }
5609    
5610  boolean setSafeMode(SafeModeAction action) throws IOException {
5611    if (action != SafeModeAction.SAFEMODE_GET) {
5612      checkSuperuserPrivilege();
5613      switch(action) {
5614      case SAFEMODE_LEAVE: // leave safe mode
5615        leaveSafeMode();
5616        break;
5617      case SAFEMODE_ENTER: // enter safe mode
5618        enterSafeMode(false);
5619        break;
5620      default:
5621        LOG.error("Unexpected safe mode action");
5622      }
5623    }
5624    return isInSafeMode();
5625  }
5626
5627  @Override
5628  public void checkSafeMode() {
5629    // safeMode is volatile, and may be set to null at any time
5630    SafeModeInfo safeMode = this.safeMode;
5631    if (safeMode != null) {
5632      safeMode.checkMode();
5633    }
5634  }
5635
5636  @Override
5637  public boolean isInSafeMode() {
5638    // safeMode is volatile, and may be set to null at any time
5639    SafeModeInfo safeMode = this.safeMode;
5640    if (safeMode == null)
5641      return false;
5642    return safeMode.isOn();
5643  }
5644
5645  @Override
5646  public boolean isInStartupSafeMode() {
5647    // safeMode is volatile, and may be set to null at any time
5648    SafeModeInfo safeMode = this.safeMode;
5649    if (safeMode == null)
5650      return false;
5651    // If the NN is in safemode, and not due to manual / low resources, we
5652    // assume it must be because of startup. If the NN had low resources during
5653    // startup, we assume it came out of startup safemode and it is now in low
5654    // resources safemode
5655    return !safeMode.isManual() && !safeMode.areResourcesLow()
5656      && safeMode.isOn();
5657  }
5658
5659  /**
5660   * Check if replication queues are to be populated
5661   * @return true when node is HAState.Active and not in the very first safemode
5662   */
5663  @Override
5664  public boolean isPopulatingReplQueues() {
5665    if (!shouldPopulateReplQueues()) {
5666      return false;
5667    }
5668    return initializedReplQueues;
5669  }
5670
5671  private boolean shouldPopulateReplQueues() {
5672    if(haContext == null || haContext.getState() == null)
5673      return false;
5674    return haContext.getState().shouldPopulateReplQueues();
5675  }
5676
5677  @Override
5678  public void incrementSafeBlockCount(int replication) {
5679    // safeMode is volatile, and may be set to null at any time
5680    SafeModeInfo safeMode = this.safeMode;
5681    if (safeMode == null)
5682      return;
5683    safeMode.incrementSafeBlockCount((short)replication);
5684  }
5685
5686  @Override
5687  public void decrementSafeBlockCount(Block b) {
5688    // safeMode is volatile, and may be set to null at any time
5689    SafeModeInfo safeMode = this.safeMode;
5690    if (safeMode == null) // mostly true
5691      return;
5692    BlockInfoContiguous storedBlock = getStoredBlock(b);
5693    if (storedBlock.isComplete()) {
5694      safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
5695    }
5696  }
5697  
5698  /**
5699   * Adjust the total number of blocks safe and expected during safe mode.
5700   * If safe mode is not currently on, this is a no-op.
5701   * @param deltaSafe the change in number of safe blocks
5702   * @param deltaTotal the change i nnumber of total blocks expected
5703   */
5704  @Override
5705  public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) {
5706    // safeMode is volatile, and may be set to null at any time
5707    SafeModeInfo safeMode = this.safeMode;
5708    if (safeMode == null)
5709      return;
5710    safeMode.adjustBlockTotals(deltaSafe, deltaTotal);
5711  }
5712
5713  /**
5714   * Set the total number of blocks in the system. 
5715   */
5716  public void setBlockTotal() {
5717    // safeMode is volatile, and may be set to null at any time
5718    SafeModeInfo safeMode = this.safeMode;
5719    if (safeMode == null)
5720      return;
5721    safeMode.setBlockTotal((int)getCompleteBlocksTotal());
5722  }
5723
5724  /**
5725   * Get the total number of blocks in the system. 
5726   */
5727  @Override // FSNamesystemMBean
5728  @Metric
5729  public long getBlocksTotal() {
5730    return blockManager.getTotalBlocks();
5731  }
5732
5733  /**
5734   * Get the total number of COMPLETE blocks in the system.
5735   * For safe mode only complete blocks are counted.
5736   */
5737  private long getCompleteBlocksTotal() {
5738    // Calculate number of blocks under construction
5739    long numUCBlocks = 0;
5740    readLock();
5741    numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
5742    try {
5743      return getBlocksTotal() - numUCBlocks;
5744    } finally {
5745      readUnlock();
5746    }
5747  }
5748
5749  /**
5750   * Enter safe mode. If resourcesLow is false, then we assume it is manual
5751   * @throws IOException
5752   */
5753  void enterSafeMode(boolean resourcesLow) throws IOException {
5754    writeLock();
5755    try {
5756      // Stop the secret manager, since rolling the master key would
5757      // try to write to the edit log
5758      stopSecretManager();
5759
5760      // Ensure that any concurrent operations have been fully synced
5761      // before entering safe mode. This ensures that the FSImage
5762      // is entirely stable on disk as soon as we're in safe mode.
5763      boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite();
5764      // Before Editlog is in OpenForWrite mode, editLogStream will be null. So,
5765      // logSyncAll call can be called only when Edlitlog is in OpenForWrite mode
5766      if (isEditlogOpenForWrite) {
5767        getEditLog().logSyncAll();
5768      }
5769      if (!isInSafeMode()) {
5770        safeMode = new SafeModeInfo(resourcesLow);
5771        return;
5772      }
5773      if (resourcesLow) {
5774        safeMode.setResourcesLow();
5775      } else {
5776        safeMode.setManual();
5777      }
5778      if (isEditlogOpenForWrite) {
5779        getEditLog().logSyncAll();
5780      }
5781      NameNode.stateChangeLog.info("STATE* Safe mode is ON"
5782          + safeMode.getTurnOffTip());
5783    } finally {
5784      writeUnlock();
5785    }
5786  }
5787
5788  /**
5789   * Leave safe mode.
5790   */
5791  void leaveSafeMode() {
5792    writeLock();
5793    try {
5794      if (!isInSafeMode()) {
5795        NameNode.stateChangeLog.info("STATE* Safe mode is already OFF"); 
5796        return;
5797      }
5798      safeMode.leave();
5799    } finally {
5800      writeUnlock();
5801    }
5802  }
5803    
5804  String getSafeModeTip() {
5805    // There is no need to take readLock.
5806    // Don't use isInSafeMode as this.safeMode might be set to null.
5807    // after isInSafeMode returns.
5808    boolean inSafeMode;
5809    SafeModeInfo safeMode = this.safeMode;
5810    if (safeMode == null) {
5811      inSafeMode = false;
5812    } else {
5813      inSafeMode = safeMode.isOn();
5814    }
5815
5816    if (!inSafeMode) {
5817      return "";
5818    } else {
5819      return safeMode.getTurnOffTip();
5820    }
5821  }
5822
  /**
   * Roll the edit log; requires superuser privilege and is refused while
   * in safe mode.
   * @return the checkpoint signature of the image after rolling
   */
  CheckpointSignature rollEditLog() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.JOURNAL);
    writeLock();
    try {
      // checkOperation is re-checked after acquiring the write lock.
      checkOperation(OperationCategory.JOURNAL);
      checkNameNodeSafeMode("Log not rolled");
      if (Server.isRpcInvocation()) {
        LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
      }
      return getFSImage().rollEditLog();
    } finally {
      writeUnlock();
    }
  }
5838
  /**
   * Start a checkpoint on behalf of a backup node.
   * @return the command to be executed by the requesting node
   */
  NamenodeCommand startCheckpoint(NamenodeRegistration backupNode,
      NamenodeRegistration activeNamenode) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    writeLock();
    try {
      // checkOperation is re-checked after acquiring the write lock.
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not started");
      
      LOG.info("Start checkpoint for " + backupNode.getAddress());
      NamenodeCommand cmd = getFSImage().startCheckpoint(backupNode,
          activeNamenode);
      // Sync the edit log before returning the command.
      getEditLog().logSync();
      return cmd;
    } finally {
      writeUnlock();
    }
  }
5856
  /**
   * Forward an incremental block report from a datanode storage to the
   * block manager, under the namesystem write lock.
   */
  public void processIncrementalBlockReport(final DatanodeID nodeID,
      final StorageReceivedDeletedBlocks srdb)
      throws IOException {
    writeLock();
    try {
      blockManager.processIncrementalBlockReport(nodeID, srdb);
    } finally {
      writeUnlock();
    }
  }
5867  
  /**
   * Complete a checkpoint previously begun via startCheckpoint.
   */
  void endCheckpoint(NamenodeRegistration registration,
                            CheckpointSignature sig) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    readLock();
    try {
      // checkOperation is re-checked after acquiring the read lock.
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not ended");
      LOG.info("End checkpoint for " + registration.getAddress());
      getFSImage().endCheckpoint(sig);
    } finally {
      readUnlock();
    }
  }
5881
5882  PermissionStatus createFsOwnerPermissions(FsPermission permission) {
5883    return new PermissionStatus(fsOwner.getShortUserName(), supergroup, permission);
5884  }
5885
5886  private void checkUnreadableBySuperuser(FSPermissionChecker pc,
5887      INode inode, int snapshotId)
5888      throws IOException {
5889    if (pc.isSuperUser()) {
5890      for (XAttr xattr : FSDirXAttrOp.getXAttrs(dir, inode, snapshotId)) {
5891        if (XAttrHelper.getPrefixName(xattr).
5892            equals(SECURITY_XATTR_UNREADABLE_BY_SUPERUSER)) {
5893          throw new AccessControlException("Access is denied for " +
5894              pc.getUser() + " since the superuser is not allowed to " +
5895              "perform this operation.");
5896        }
5897      }
5898    }
5899  }
5900
5901  @Override
5902  public void checkSuperuserPrivilege()
5903      throws AccessControlException {
5904    if (isPermissionEnabled) {
5905      FSPermissionChecker pc = getPermissionChecker();
5906      pc.checkSuperuserPrivilege();
5907    }
5908  }
5909
5910  /**
5911   * Check to see if we have exceeded the limit on the number
5912   * of inodes.
5913   */
5914  void checkFsObjectLimit() throws IOException {
5915    if (maxFsObjects != 0 &&
5916        maxFsObjects <= dir.totalInodes() + getBlocksTotal()) {
5917      throw new IOException("Exceeded the configured number of objects " +
5918                             maxFsObjects + " in the filesystem.");
5919    }
5920  }
5921
5922  /**
5923   * Get the total number of objects in the system. 
5924   */
5925  @Override // FSNamesystemMBean
5926  public long getMaxObjects() {
5927    return maxFsObjects;
5928  }
5929
5930  @Override // FSNamesystemMBean
5931  @Metric
5932  public long getFilesTotal() {
5933    // There is no need to take fSNamesystem's lock as
5934    // FSDirectory has its own lock.
5935    return this.dir.totalInodes();
5936  }
5937
  /** Number of blocks currently pending replication. */
  @Override // FSNamesystemMBean
  @Metric
  public long getPendingReplicationBlocks() {
    return blockManager.getPendingReplicationBlocksCount();
  }

  /** Number of blocks below their target replication factor. */
  @Override // FSNamesystemMBean
  @Metric
  public long getUnderReplicatedBlocks() {
    return blockManager.getUnderReplicatedBlocksCount();
  }

  /** Returns number of blocks with corrupt replicas */
  @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"})
  public long getCorruptReplicaBlocks() {
    return blockManager.getCorruptReplicaBlocksCount();
  }

  /** Number of blocks with replication work currently scheduled. */
  @Override // FSNamesystemMBean
  @Metric
  public long getScheduledReplicationBlocks() {
    return blockManager.getScheduledReplicationBlocksCount();
  }

  /** Number of blocks queued for deletion. */
  @Override
  @Metric
  public long getPendingDeletionBlocks() {
    return blockManager.getPendingDeletionBlocksCount();
  }

  /** Start time plus the configured startup block-deletion delay. */
  @Override
  public long getBlockDeletionStartTime() {
    return startTime + blockManager.getStartupDelayBlockDeletionInMs();
  }

  /** Number of excess (over-replicated) block replicas. */
  @Metric
  public long getExcessBlocks() {
    return blockManager.getExcessBlocksCount();
  }
  
  // HA-only metric
  @Metric
  public long getPostponedMisreplicatedBlocks() {
    return blockManager.getPostponedMisreplicatedBlocksCount();
  }

  // HA-only metric
  @Metric
  public int getPendingDataNodeMessageCount() {
    return blockManager.getPendingDataNodeMessageCount();
  }
  
  // HA-only metric: current HA state as a string.
  @Metric
  public String getHAState() {
    return haContext.getState().toString();
  }
5995
5996  // HA-only metric
5997  @Metric
5998  public long getMillisSinceLastLoadedEdits() {
5999    if (isInStandbyState() && editLogTailer != null) {
6000      return monotonicNow() - editLogTailer.getLastLoadTimeMs();
6001    } else {
6002      return 0;
6003    }
6004  }
6005  
  /** Capacity value reported by the block manager. */
  @Metric
  public int getBlockCapacity() {
    return blockManager.getCapacity();
  }
6010
6011  @Override // FSNamesystemMBean
6012  public String getFSState() {
6013    return isInSafeMode() ? "safeMode" : "Operational";
6014  }
6015  
  // JMX ObjectName for the FSNamesystemMBean; set in registerMBean(),
  // cleared in shutdown().
  private ObjectName mbeanName;
  // JMX ObjectName for the MXBean; presumably registered elsewhere in this
  // file, cleared in shutdown().
  private ObjectName mxbeanName;
6018
6019  /**
6020   * Register the FSNamesystem MBean using the name
6021   *        "hadoop:service=NameNode,name=FSNamesystemState"
6022   */
6023  private void registerMBean() {
6024    // We can only implement one MXBean interface, so we keep the old one.
6025    try {
6026      StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class);
6027      mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean);
6028    } catch (NotCompliantMBeanException e) {
6029      throw new RuntimeException("Bad MBean setup", e);
6030    }
6031
6032    LOG.info("Registered FSNamesystemState MBean");
6033  }
6034
6035  /**
6036   * shutdown FSNamesystem
6037   */
6038  void shutdown() {
6039    if (snapshotManager != null) {
6040      snapshotManager.shutdown();
6041    }
6042    if (mbeanName != null) {
6043      MBeans.unregister(mbeanName);
6044      mbeanName = null;
6045    }
6046    if (mxbeanName != null) {
6047      MBeans.unregister(mxbeanName);
6048      mxbeanName = null;
6049    }
6050    if (dir != null) {
6051      dir.shutdown();
6052    }
6053    if (blockManager != null) {
6054      blockManager.shutdown();
6055    }
6056  }
6057
  /** Number of datanodes currently considered live. */
  @Override // FSNamesystemMBean
  public int getNumLiveDataNodes() {
    return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
  }

  /** Number of datanodes currently considered dead. */
  @Override // FSNamesystemMBean
  public int getNumDeadDataNodes() {
    return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
  }
6067  
6068  @Override // FSNamesystemMBean
6069  public int getNumDecomLiveDataNodes() {
6070    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6071    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
6072    int liveDecommissioned = 0;
6073    for (DatanodeDescriptor node : live) {
6074      liveDecommissioned += node.isDecommissioned() ? 1 : 0;
6075    }
6076    return liveDecommissioned;
6077  }
6078
6079  @Override // FSNamesystemMBean
6080  public int getNumDecomDeadDataNodes() {
6081    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
6082    getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, true);
6083    int deadDecommissioned = 0;
6084    for (DatanodeDescriptor node : dead) {
6085      deadDecommissioned += node.isDecommissioned() ? 1 : 0;
6086    }
6087    return deadDecommissioned;
6088  }
6089
6090  @Override // FSNamesystemMBean
6091  public int getVolumeFailuresTotal() {
6092    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6093    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
6094    int volumeFailuresTotal = 0;
6095    for (DatanodeDescriptor node: live) {
6096      volumeFailuresTotal += node.getVolumeFailures();
6097    }
6098    return volumeFailuresTotal;
6099  }
6100
6101  @Override // FSNamesystemMBean
6102  public long getEstimatedCapacityLostTotal() {
6103    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6104    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
6105    long estimatedCapacityLostTotal = 0;
6106    for (DatanodeDescriptor node: live) {
6107      VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
6108      if (volumeFailureSummary != null) {
6109        estimatedCapacityLostTotal +=
6110            volumeFailureSummary.getEstimatedCapacityLostTotal();
6111      }
6112    }
6113    return estimatedCapacityLostTotal;
6114  }
6115
  /** Number of datanodes currently in the decommissioning state. */
  @Override // FSNamesystemMBean
  public int getNumDecommissioningDataNodes() {
    return getBlockManager().getDatanodeManager().getDecommissioningNodes()
        .size();
  }

  @Override // FSNamesystemMBean
  @Metric({"StaleDataNodes", 
    "Number of datanodes marked stale due to delayed heartbeat"})
  public int getNumStaleDataNodes() {
    return getBlockManager().getDatanodeManager().getNumStaleNodes();
  }

  /**
   * Storages are marked as "content stale" after NN restart or fails over and
   * before NN receives the first Heartbeat followed by the first Blockreport.
   */
  @Override // FSNamesystemMBean
  public int getNumStaleStorages() {
    return getBlockManager().getDatanodeManager().getNumStaleStorages();
  }
6137
6138  @Override // FSNamesystemMBean
6139  public String getTopUserOpCounts() {
6140    if (!topConf.isEnabled) {
6141      return null;
6142    }
6143
6144    Date now = new Date();
6145    final List<RollingWindowManager.TopWindow> topWindows =
6146        topMetrics.getTopWindows();
6147    Map<String, Object> topMap = new TreeMap<String, Object>();
6148    topMap.put("windows", topWindows);
6149    topMap.put("timestamp", DFSUtil.dateToIso8601String(now));
6150    ObjectMapper mapper = new ObjectMapper();
6151    try {
6152      return mapper.writeValueAsString(topMap);
6153    } catch (IOException e) {
6154      LOG.warn("Failed to fetch TopUser metrics", e);
6155    }
6156    return null;
6157  }
6158
6159  /**
6160   * Increments, logs and then returns the stamp
6161   */
6162  long nextGenerationStamp(boolean legacyBlock)
6163      throws IOException, SafeModeException {
6164    assert hasWriteLock();
6165    checkNameNodeSafeMode("Cannot get next generation stamp");
6166
6167    long gs = blockIdManager.nextGenerationStamp(legacyBlock);
6168    if (legacyBlock) {
6169      getEditLog().logGenerationStampV1(gs);
6170    } else {
6171      getEditLog().logGenerationStampV2(gs);
6172    }
6173
6174    // NB: callers sync the log
6175    return gs;
6176  }
6177
6178  /**
6179   * Increments, logs and then returns the block ID
6180   */
6181  private long nextBlockId() throws IOException {
6182    assert hasWriteLock();
6183    checkNameNodeSafeMode("Cannot get next block ID");
6184    final long blockId = blockIdManager.nextBlockId();
6185    getEditLog().logAllocateBlockId(blockId);
6186    // NB: callers sync the log
6187    return blockId;
6188  }
6189
  /**
   * Determine whether the given file has been deleted from the namespace,
   * either directly or via recursive deletion of an ancestor directory.
   */
  private boolean isFileDeleted(INodeFile file) {
    // Not in the inodeMap or in the snapshot but marked deleted.
    if (dir.getInode(file.getId()) == null) {
      return true;
    }

    // look at the path hierarchy to see if one parent is deleted by recursive
    // deletion
    INode tmpChild = file;
    INodeDirectory tmpParent = file.getParent();
    while (true) {
      if (tmpParent == null) {
        // Detached from the tree: some ancestor was removed.
        return true;
      }

      INode childINode = tmpParent.getChild(tmpChild.getLocalNameBytes(),
          Snapshot.CURRENT_STATE_ID);
      if (childINode == null || !childINode.equals(tmpChild)) {
        // a newly created INode with the same name as an already deleted one
        // would be a different INode than the deleted one
        return true;
      }

      if (tmpParent.isRoot()) {
        // Reached the root with every ancestor link intact.
        break;
      }

      tmpChild = tmpParent;
      tmpParent = tmpParent.getParent();
    }

    // The path is intact, but the file may still be logically deleted in
    // the current state while retained in a snapshot.
    if (file.isWithSnapshot() &&
        file.getFileWithSnapshotFeature().isCurrentFileDeleted()) {
      return true;
    }
    return false;
  }
6227
  /**
   * Verify that the given block is under construction and that the named
   * client holds its lease, before allowing a generation stamp / access
   * token update.
   * @return the file the block belongs to
   * @throws IOException if the block is missing or not under construction
   * @throws LeaseExpiredException if the client does not hold the lease
   */
  private INodeFile checkUCBlock(ExtendedBlock block,
      String clientName) throws IOException {
    assert hasWriteLock();
    checkNameNodeSafeMode("Cannot get a new generation stamp and an "
        + "access token for block " + block);
    
    // check stored block state
    BlockInfoContiguous storedBlock = getStoredBlock(ExtendedBlock.getLocalBlock(block));
    if (storedBlock == null || 
        storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) {
        // NOTE(review): the message lacks a separator before storedBlock.
        throw new IOException(block + 
            " does not exist or is not under Construction" + storedBlock);
    }
    
    // check file inode
    final INodeFile file = ((INode)storedBlock.getBlockCollection()).asFile();
    if (file == null || !file.isUnderConstruction() || isFileDeleted(file)) {
      throw new IOException("The file " + storedBlock + 
          " belonged to does not exist or it is not under construction.");
    }
    
    // check lease
    if (clientName == null
        || !clientName.equals(file.getFileUnderConstructionFeature()
            .getClientName())) {
      throw new LeaseExpiredException("Lease mismatch: " + block + 
          " is accessed by a non lease holder " + clientName); 
    }

    return file;
  }
6259  
6260  /**
6261   * Client is reporting some bad block locations.
6262   */
6263  void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
6264    checkOperation(OperationCategory.WRITE);
6265    NameNode.stateChangeLog.info("*DIR* reportBadBlocks");
6266    writeLock();
6267    try {
6268      checkOperation(OperationCategory.WRITE);
6269      for (int i = 0; i < blocks.length; i++) {
6270        ExtendedBlock blk = blocks[i].getBlock();
6271        DatanodeInfo[] nodes = blocks[i].getLocations();
6272        String[] storageIDs = blocks[i].getStorageIDs();
6273        for (int j = 0; j < nodes.length; j++) {
6274          blockManager.findAndMarkBlockAsCorrupt(blk, nodes[j],
6275              storageIDs == null ? null: storageIDs[j], 
6276              "client machine reported it");
6277        }
6278      }
6279    } finally {
6280      writeUnlock();
6281    }
6282  }
6283
6284  /**
6285   * Get a new generation stamp together with an access token for 
6286   * a block under construction
6287   * 
6288   * This method is called for recovering a failed pipeline or setting up
6289   * a pipeline to append to a block.
6290   * 
6291   * @param block a block
6292   * @param clientName the name of a client
6293   * @return a located block with a new generation stamp and an access token
6294   * @throws IOException if any error occurs
6295   */
6296  LocatedBlock updateBlockForPipeline(ExtendedBlock block, 
6297      String clientName) throws IOException {
6298    LocatedBlock locatedBlock;
6299    checkOperation(OperationCategory.WRITE);
6300    writeLock();
6301    try {
6302      checkOperation(OperationCategory.WRITE);
6303
6304      // check vadility of parameters
6305      checkUCBlock(block, clientName);
6306  
6307      // get a new generation stamp and an access token
6308      block.setGenerationStamp(nextGenerationStamp(blockIdManager.isLegacyBlock(block.getLocalBlock())));
6309      locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]);
6310      blockManager.setBlockToken(locatedBlock, AccessMode.WRITE);
6311    } finally {
6312      writeUnlock();
6313    }
6314    // Ensure we record the new generation stamp
6315    getEditLog().logSync();
6316    return locatedBlock;
6317  }
6318  
6319  /**
6320   * Update a pipeline for a block under construction
6321   * 
6322   * @param clientName the name of the client
6323   * @param oldBlock and old block
6324   * @param newBlock a new block with a new generation stamp and length
6325   * @param newNodes datanodes in the pipeline
6326   * @throws IOException if any error occurs
6327   */
6328  void updatePipeline(
6329      String clientName, ExtendedBlock oldBlock, ExtendedBlock newBlock,
6330      DatanodeID[] newNodes, String[] newStorageIDs, boolean logRetryCache)
6331      throws IOException {
6332    checkOperation(OperationCategory.WRITE);
6333
6334    LOG.info("updatePipeline(" + oldBlock.getLocalBlock()
6335             + ", newGS=" + newBlock.getGenerationStamp()
6336             + ", newLength=" + newBlock.getNumBytes()
6337             + ", newNodes=" + Arrays.asList(newNodes)
6338             + ", client=" + clientName
6339             + ")");
6340    waitForLoadingFSImage();
6341    writeLock();
6342    try {
6343      checkOperation(OperationCategory.WRITE);
6344      checkNameNodeSafeMode("Pipeline not updated");
6345      assert newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and "
6346        + oldBlock + " has different block identifier";
6347      updatePipelineInternal(clientName, oldBlock, newBlock, newNodes,
6348          newStorageIDs, logRetryCache);
6349    } finally {
6350      writeUnlock();
6351    }
6352    getEditLog().logSync();
6353    LOG.info("updatePipeline(" + oldBlock.getLocalBlock() + " => "
6354        + newBlock.getLocalBlock() + ") success");
6355  }
6356
  /**
   * Core of {@link #updatePipeline}: applies the new generation stamp,
   * length, and expected locations to the file's last (under-construction)
   * block, then persists the change. Caller must hold the write lock.
   */
  private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock,
      ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs,
      boolean logRetryCache)
      throws IOException {
    assert hasWriteLock();
    // check the validity of the block and lease holder name
    final INodeFile pendingFile = checkUCBlock(oldBlock, clientName);
    final BlockInfoContiguousUnderConstruction blockinfo
        = (BlockInfoContiguousUnderConstruction)pendingFile.getLastBlock();

    // check new GS & length: this is not expected — the new generation stamp
    // must be strictly newer and the block must not shrink.
    if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() ||
        newBlock.getNumBytes() < blockinfo.getNumBytes()) {
      String msg = "Update " + oldBlock + " (len = " + 
        blockinfo.getNumBytes() + ") to an older state: " + newBlock + 
        " (len = " + newBlock.getNumBytes() +")";
      LOG.warn(msg);
      throw new IOException(msg);
    }

    // Update old block with the new generation stamp and new length
    blockinfo.setNumBytes(newBlock.getNumBytes());
    blockinfo.setGenerationStampAndVerifyReplicas(newBlock.getGenerationStamp());

    // find the DatanodeDescriptor objects for the new pipeline members
    final DatanodeStorageInfo[] storages = blockManager.getDatanodeManager()
        .getDatanodeStorageInfos(newNodes, newStorageIDs);
    blockinfo.setExpectedLocations(storages);

    // Persist the updated block list; logRetryCache is forwarded to the
    // persist call (presumably for RPC retry-cache bookkeeping — confirm).
    String src = pendingFile.getFullPathName();
    persistBlocks(src, pendingFile, logRetryCache);
  }
6389
  /**
   * Called after a successful rename: if any file in the renamed subtree was
   * being written to, re-key its lease from the old path to the new one.
   * Caller must hold the write lock.
   */
  void unprotectedChangeLease(String src, String dst) {
    assert hasWriteLock();
    leaseManager.changeLease(src, dst);
  }
6396
6397  /**
6398   * Serializes leases.
6399   */
6400  void saveFilesUnderConstruction(DataOutputStream out,
6401      Map<Long, INodeFile> snapshotUCMap) throws IOException {
6402    // This is run by an inferior thread of saveNamespace, which holds a read
6403    // lock on our behalf. If we took the read lock here, we could block
6404    // for fairness if a writer is waiting on the lock.
6405    synchronized (leaseManager) {
6406      Map<String, INodeFile> nodes = leaseManager.getINodesUnderConstruction();
6407      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
6408        // TODO: for HDFS-5428, because of rename operations, some
6409        // under-construction files that are
6410        // in the current fs directory can also be captured in the
6411        // snapshotUCMap. We should remove them from the snapshotUCMap.
6412        snapshotUCMap.remove(entry.getValue().getId());
6413      }
6414
6415      out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size
6416      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
6417        FSImageSerialization.writeINodeUnderConstruction(
6418            out, entry.getValue(), entry.getKey());
6419      }
6420      for (Map.Entry<Long, INodeFile> entry : snapshotUCMap.entrySet()) {
6421        // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
6422        // as their paths
6423        StringBuilder b = new StringBuilder();
6424        b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
6425            .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
6426            .append(Path.SEPARATOR).append(entry.getValue().getId());
6427        FSImageSerialization.writeINodeUnderConstruction(
6428            out, entry.getValue(), b.toString());
6429      }
6430    }
6431  }
6432
6433  /**
6434   * @return all the under-construction files in the lease map
6435   */
6436  Map<String, INodeFile> getFilesUnderConstruction() {
6437    synchronized (leaseManager) {
6438      return leaseManager.getINodesUnderConstruction();
6439    }
6440  }
6441
6442  /**
6443   * Register a Backup name-node, verifying that it belongs
6444   * to the correct namespace, and adding it to the set of
6445   * active journals if necessary.
6446   * 
6447   * @param bnReg registration of the new BackupNode
6448   * @param nnReg registration of this NameNode
6449   * @throws IOException if the namespace IDs do not match
6450   */
6451  void registerBackupNode(NamenodeRegistration bnReg,
6452      NamenodeRegistration nnReg) throws IOException {
6453    writeLock();
6454    try {
6455      if(getFSImage().getStorage().getNamespaceID() 
6456         != bnReg.getNamespaceID())
6457        throw new IOException("Incompatible namespaceIDs: "
6458            + " Namenode namespaceID = "
6459            + getFSImage().getStorage().getNamespaceID() + "; "
6460            + bnReg.getRole() +
6461            " node namespaceID = " + bnReg.getNamespaceID());
6462      if (bnReg.getRole() == NamenodeRole.BACKUP) {
6463        getFSImage().getEditLog().registerBackupNode(
6464            bnReg, nnReg);
6465      }
6466    } finally {
6467      writeUnlock();
6468    }
6469  }
6470
6471  /**
6472   * Release (unregister) backup node.
6473   * <p>
6474   * Find and remove the backup stream corresponding to the node.
6475   * @throws IOException
6476   */
6477  void releaseBackupNode(NamenodeRegistration registration)
6478    throws IOException {
6479    checkOperation(OperationCategory.WRITE);
6480    writeLock();
6481    try {
6482      checkOperation(OperationCategory.WRITE);
6483      if(getFSImage().getStorage().getNamespaceID()
6484         != registration.getNamespaceID())
6485        throw new IOException("Incompatible namespaceIDs: "
6486            + " Namenode namespaceID = "
6487            + getFSImage().getStorage().getNamespaceID() + "; "
6488            + registration.getRole() +
6489            " node namespaceID = " + registration.getNamespaceID());
6490      getEditLog().releaseBackupStream(registration);
6491    } finally {
6492      writeUnlock();
6493    }
6494  }
6495
6496  static class CorruptFileBlockInfo {
6497    final String path;
6498    final Block block;
6499    
6500    public CorruptFileBlockInfo(String p, Block b) {
6501      path = p;
6502      block = b;
6503    }
6504    
6505    @Override
6506    public String toString() {
6507      return block.getBlockName() + "\t" + path;
6508    }
6509  }
6510  /**
6511   * @param path Restrict corrupt files to this portion of namespace.
6512   * @param cookieTab Support for continuation; cookieTab  tells where
6513   *                  to start from
6514   * @return a list in which each entry describes a corrupt file/block
6515   * @throws IOException
6516   */
6517  Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
6518  String[] cookieTab) throws IOException {
6519    checkSuperuserPrivilege();
6520    checkOperation(OperationCategory.READ);
6521
6522    int count = 0;
6523    ArrayList<CorruptFileBlockInfo> corruptFiles =
6524        new ArrayList<CorruptFileBlockInfo>();
6525    if (cookieTab == null) {
6526      cookieTab = new String[] { null };
6527    }
6528
6529    // Do a quick check if there are any corrupt files without taking the lock
6530    if (blockManager.getMissingBlocksCount() == 0) {
6531      if (cookieTab[0] == null) {
6532        cookieTab[0] = String.valueOf(getIntCookie(cookieTab[0]));
6533      }
6534      if (LOG.isDebugEnabled()) {
6535        LOG.debug("there are no corrupt file blocks.");
6536      }
6537      return corruptFiles;
6538    }
6539
6540    readLock();
6541    try {
6542      checkOperation(OperationCategory.READ);
6543      if (!isPopulatingReplQueues()) {
6544        throw new IOException("Cannot run listCorruptFileBlocks because " +
6545                              "replication queues have not been initialized.");
6546      }
6547      // print a limited # of corrupt files per call
6548
6549      final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator();
6550
6551      int skip = getIntCookie(cookieTab[0]);
6552      for (int i = 0; i < skip && blkIterator.hasNext(); i++) {
6553        blkIterator.next();
6554      }
6555
6556      while (blkIterator.hasNext()) {
6557        Block blk = blkIterator.next();
6558        final INode inode = (INode)blockManager.getBlockCollection(blk);
6559        skip++;
6560        if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) {
6561          String src = FSDirectory.getFullPathName(inode);
6562          if (src.startsWith(path)){
6563            corruptFiles.add(new CorruptFileBlockInfo(src, blk));
6564            count++;
6565            if (count >= DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED)
6566              break;
6567          }
6568        }
6569      }
6570      cookieTab[0] = String.valueOf(skip);
6571      if (LOG.isDebugEnabled()) {
6572        LOG.debug("list corrupt file blocks returned: " + count);
6573      }
6574      return corruptFiles;
6575    } finally {
6576      readUnlock();
6577    }
6578  }
6579
6580  /**
6581   * Convert string cookie to integer.
6582   */
6583  private static int getIntCookie(String cookie){
6584    int c;
6585    if(cookie == null){
6586      c = 0;
6587    } else {
6588      try{
6589        c = Integer.parseInt(cookie);
6590      }catch (NumberFormatException e) {
6591        c = 0;
6592      }
6593    }
6594    c = Math.max(0, c);
6595    return c;
6596  }
6597
6598  /**
6599   * Create delegation token secret manager
6600   */
6601  private DelegationTokenSecretManager createDelegationTokenSecretManager(
6602      Configuration conf) {
6603    return new DelegationTokenSecretManager(conf.getLong(
6604        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY,
6605        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT),
6606        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY,
6607            DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT),
6608        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
6609            DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT),
6610        DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL,
6611        conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
6612            DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT),
6613        this);
6614  }
6615
6616  /**
6617   * Returns the DelegationTokenSecretManager instance in the namesystem.
6618   * @return delegation token secret manager object
6619   */
6620  DelegationTokenSecretManager getDelegationTokenSecretManager() {
6621    return dtSecretManager;
6622  }
6623
6624  /**
6625   * @param renewer Renewer information
6626   * @return delegation toek
6627   * @throws IOException on error
6628   */
6629  Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
6630      throws IOException {
6631    Token<DelegationTokenIdentifier> token;
6632    checkOperation(OperationCategory.WRITE);
6633    writeLock();
6634    try {
6635      checkOperation(OperationCategory.WRITE);
6636      checkNameNodeSafeMode("Cannot issue delegation token");
6637      if (!isAllowedDelegationTokenOp()) {
6638        throw new IOException(
6639          "Delegation Token can be issued only with kerberos or web authentication");
6640      }
6641      if (dtSecretManager == null || !dtSecretManager.isRunning()) {
6642        LOG.warn("trying to get DT with no secret manager running");
6643        return null;
6644      }
6645
6646      UserGroupInformation ugi = getRemoteUser();
6647      String user = ugi.getUserName();
6648      Text owner = new Text(user);
6649      Text realUser = null;
6650      if (ugi.getRealUser() != null) {
6651        realUser = new Text(ugi.getRealUser().getUserName());
6652      }
6653      DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner,
6654        renewer, realUser);
6655      token = new Token<DelegationTokenIdentifier>(
6656        dtId, dtSecretManager);
6657      long expiryTime = dtSecretManager.getTokenExpiryTime(dtId);
6658      getEditLog().logGetDelegationToken(dtId, expiryTime);
6659    } finally {
6660      writeUnlock();
6661    }
6662    getEditLog().logSync();
6663    return token;
6664  }
6665
6666  /**
6667   * 
6668   * @param token token to renew
6669   * @return new expiryTime of the token
6670   * @throws InvalidToken if {@code token} is invalid
6671   * @throws IOException on other errors
6672   */
6673  long renewDelegationToken(Token<DelegationTokenIdentifier> token)
6674      throws InvalidToken, IOException {
6675    long expiryTime;
6676    checkOperation(OperationCategory.WRITE);
6677    writeLock();
6678    try {
6679      checkOperation(OperationCategory.WRITE);
6680
6681      checkNameNodeSafeMode("Cannot renew delegation token");
6682      if (!isAllowedDelegationTokenOp()) {
6683        throw new IOException(
6684            "Delegation Token can be renewed only with kerberos or web authentication");
6685      }
6686      String renewer = getRemoteUser().getShortUserName();
6687      expiryTime = dtSecretManager.renewToken(token, renewer);
6688      DelegationTokenIdentifier id = new DelegationTokenIdentifier();
6689      ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier());
6690      DataInputStream in = new DataInputStream(buf);
6691      id.readFields(in);
6692      getEditLog().logRenewDelegationToken(id, expiryTime);
6693    } finally {
6694      writeUnlock();
6695    }
6696    getEditLog().logSync();
6697    return expiryTime;
6698  }
6699
6700  /**
6701   * 
6702   * @param token token to cancel
6703   * @throws IOException on error
6704   */
6705  void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
6706      throws IOException {
6707    checkOperation(OperationCategory.WRITE);
6708    writeLock();
6709    try {
6710      checkOperation(OperationCategory.WRITE);
6711
6712      checkNameNodeSafeMode("Cannot cancel delegation token");
6713      String canceller = getRemoteUser().getUserName();
6714      DelegationTokenIdentifier id = dtSecretManager
6715        .cancelToken(token, canceller);
6716      getEditLog().logCancelDelegationToken(id);
6717    } finally {
6718      writeUnlock();
6719    }
6720    getEditLog().logSync();
6721  }
6722
6723  /**
6724   * @param out save state of the secret manager
6725   * @param sdPath String storage directory path
6726   */
6727  void saveSecretManagerStateCompat(DataOutputStream out, String sdPath)
6728      throws IOException {
6729    dtSecretManager.saveSecretManagerStateCompat(out, sdPath);
6730  }
6731
  /** @return a snapshot of the delegation token secret manager's state. */
  SecretManagerState saveSecretManagerState() {
    return dtSecretManager.saveSecretManagerState();
  }
6735
6736  /**
6737   * @param in load the state of secret manager from input stream
6738   */
6739  void loadSecretManagerStateCompat(DataInput in) throws IOException {
6740    dtSecretManager.loadSecretManagerStateCompat(in);
6741  }
6742
  /** Load secret manager state from deserialized fsimage sections. */
  void loadSecretManagerState(SecretManagerSection s,
      List<SecretManagerSection.DelegationKey> keys,
      List<SecretManagerSection.PersistToken> tokens) throws IOException {
    dtSecretManager.loadSecretManagerState(new SecretManagerState(s, keys, tokens));
  }
6748
6749  /**
6750   * Log the updateMasterKey operation to edit logs
6751   * 
6752   * @param key new delegation key.
6753   */
6754  public void logUpdateMasterKey(DelegationKey key) {
6755    
6756    assert !isInSafeMode() :
6757      "this should never be called while in safemode, since we stop " +
6758      "the DT manager before entering safemode!";
6759    // No need to hold FSN lock since we don't access any internal
6760    // structures, and this is stopped before the FSN shuts itself
6761    // down, etc.
6762    getEditLog().logUpdateMasterKey(key);
6763    getEditLog().logSync();
6764  }
6765  
6766  /**
6767   * Log the cancellation of expired tokens to edit logs
6768   * 
6769   * @param id token identifier to cancel
6770   */
6771  public void logExpireDelegationToken(DelegationTokenIdentifier id) {
6772    assert !isInSafeMode() :
6773      "this should never be called while in safemode, since we stop " +
6774      "the DT manager before entering safemode!";
6775    // No need to hold FSN lock since we don't access any internal
6776    // structures, and this is stopped before the FSN shuts itself
6777    // down, etc.
6778    getEditLog().logCancelDelegationToken(id);
6779  }  
6780  
  /** Log a lease reassignment to the edit log. Caller holds the write lock. */
  private void logReassignLease(String leaseHolder, String src,
      String newHolder) {
    assert hasWriteLock();
    getEditLog().logReassignLease(leaseHolder, src, newHolder);
  }
6786  
6787  /**
6788   * 
6789   * @return true if delegation token operation is allowed
6790   */
6791  private boolean isAllowedDelegationTokenOp() throws IOException {
6792    AuthenticationMethod authMethod = getConnectionAuthenticationMethod();
6793    if (UserGroupInformation.isSecurityEnabled()
6794        && (authMethod != AuthenticationMethod.KERBEROS)
6795        && (authMethod != AuthenticationMethod.KERBEROS_SSL)
6796        && (authMethod != AuthenticationMethod.CERTIFICATE)) {
6797      return false;
6798    }
6799    return true;
6800  }
6801  
6802  /**
6803   * Returns authentication method used to establish the connection
6804   * @return AuthenticationMethod used to establish connection
6805   * @throws IOException
6806   */
6807  private AuthenticationMethod getConnectionAuthenticationMethod()
6808      throws IOException {
6809    UserGroupInformation ugi = getRemoteUser();
6810    AuthenticationMethod authMethod = ugi.getAuthenticationMethod();
6811    if (authMethod == AuthenticationMethod.PROXY) {
6812      authMethod = ugi.getRealUser().getAuthenticationMethod();
6813    }
6814    return authMethod;
6815  }
6816  
6817  /**
6818   * Client invoked methods are invoked over RPC and will be in 
6819   * RPC call context even if the client exits.
6820   */
6821  boolean isExternalInvocation() {
6822    return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation();
6823  }
6824
6825  private static InetAddress getRemoteIp() {
6826    InetAddress ip = Server.getRemoteIp();
6827    if (ip != null) {
6828      return ip;
6829    }
6830    return NamenodeWebHdfsMethods.getRemoteIp();
6831  }
6832  
  // optimize ugi lookup for RPC operations to avoid a trip through
  // UGI.getCurrentUser which is synch'ed
  private static UserGroupInformation getRemoteUser() throws IOException {
    return NameNode.getRemoteUser();
  }
6838  
6839  /**
6840   * Log fsck event in the audit log 
6841   */
6842  void logFsckEvent(String src, InetAddress remoteAddress) throws IOException {
6843    if (isAuditEnabled()) {
6844      logAuditEvent(true, getRemoteUser(),
6845                    remoteAddress,
6846                    "fsck", src, null, null);
6847    }
6848  }
6849  /**
6850   * Register NameNodeMXBean
6851   */
6852  private void registerMXBean() {
6853    mxbeanName = MBeans.register("NameNode", "NameNodeInfo", this);
6854  }
6855
6856  /**
6857   * Class representing Namenode information for JMX interfaces
6858   */
6859  @Override // NameNodeMXBean
6860  public String getVersion() {
6861    return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision();
6862  }
6863
  /** JMX: delegates to the cluster capacity-used figure. */
  @Override // NameNodeMXBean
  public long getUsed() {
    return this.getCapacityUsed();
  }
6868
  /** JMX: delegates to the cluster capacity-remaining figure. */
  @Override // NameNodeMXBean
  public long getFree() {
    return this.getCapacityRemaining();
  }
6873
  /** JMX: delegates to the cluster total-capacity figure. */
  @Override // NameNodeMXBean
  public long getTotal() {
    return this.getCapacityTotal();
  }
6878
6879  @Override // NameNodeMXBean
6880  public String getSafemode() {
6881    if (!this.isInSafeMode())
6882      return "";
6883    return "Safe mode is ON. " + this.getSafeModeTip();
6884  }
6885
  /** JMX: whether the layout upgrade has been finalized (from FSImage). */
  @Override // NameNodeMXBean
  public boolean isUpgradeFinalized() {
    return this.getFSImage().isUpgradeFinalized();
  }
6890
  /** JMX: non-DFS used capacity, from datanode statistics. */
  @Override // NameNodeMXBean
  public long getNonDfsUsedSpace() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }
6895
  /** JMX: percentage of capacity used, from datanode statistics. */
  @Override // NameNodeMXBean
  public float getPercentUsed() {
    return datanodeStatistics.getCapacityUsedPercent();
  }
6900
  /** JMX: space used by this block pool, from datanode statistics. */
  @Override // NameNodeMXBean
  public long getBlockPoolUsedSpace() {
    return datanodeStatistics.getBlockPoolUsed();
  }
6905
  /** JMX: percentage of capacity used by this block pool. */
  @Override // NameNodeMXBean
  public float getPercentBlockPoolUsed() {
    return datanodeStatistics.getPercentBlockPoolUsed();
  }
6910
  /** JMX: percentage of capacity remaining, from datanode statistics. */
  @Override // NameNodeMXBean
  public float getPercentRemaining() {
    return datanodeStatistics.getCapacityRemainingPercent();
  }
6915
  /** JMX: total cache capacity across datanodes. */
  @Override // NameNodeMXBean
  public long getCacheCapacity() {
    return datanodeStatistics.getCacheCapacity();
  }
6920
  /** JMX: cache space in use across datanodes. */
  @Override // NameNodeMXBean
  public long getCacheUsed() {
    return datanodeStatistics.getCacheUsed();
  }
6925
  /** JMX: delegates to the namespace block count. */
  @Override // NameNodeMXBean
  public long getTotalBlocks() {
    return getBlocksTotal();
  }
6930
  /** Exposed both as a metric and via JMX; delegates to getFilesTotal(). */
  @Override // NameNodeMXBean
  @Metric
  public long getTotalFiles() {
    return getFilesTotal();
  }
6936
  /** JMX: number of missing blocks in the namespace. */
  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocks() {
    return getMissingBlocksCount();
  }
6941  
  /** JMX: missing blocks whose replication factor is one. */
  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocksWithReplicationFactorOne() {
    return getMissingReplOneBlocksCount();
  }
6946
  /** JMX: live JVM thread count, from the thread MXBean. */
  @Override // NameNodeMXBean
  public int getThreads() {
    return ManagementFactory.getThreadMXBean().getThreadCount();
  }
6951
6952  /**
6953   * Returned information is a JSON representation of map with host name as the
6954   * key and value is a map of live node attribute keys to its values
6955   */
6956  @Override // NameNodeMXBean
6957  public String getLiveNodes() {
6958    final Map<String, Map<String,Object>> info = 
6959      new HashMap<String, Map<String,Object>>();
6960    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6961    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);
6962    for (DatanodeDescriptor node : live) {
6963      ImmutableMap.Builder<String, Object> innerinfo =
6964          ImmutableMap.<String,Object>builder();
6965      innerinfo
6966          .put("infoAddr", node.getInfoAddr())
6967          .put("infoSecureAddr", node.getInfoSecureAddr())
6968          .put("xferaddr", node.getXferAddr())
6969          .put("lastContact", getLastContact(node))
6970          .put("usedSpace", getDfsUsed(node))
6971          .put("adminState", node.getAdminState().toString())
6972          .put("nonDfsUsedSpace", node.getNonDfsUsed())
6973          .put("capacity", node.getCapacity())
6974          .put("numBlocks", node.numBlocks())
6975          .put("version", node.getSoftwareVersion())
6976          .put("used", node.getDfsUsed())
6977          .put("remaining", node.getRemaining())
6978          .put("blockScheduled", node.getBlocksScheduled())
6979          .put("blockPoolUsed", node.getBlockPoolUsed())
6980          .put("blockPoolUsedPercent", node.getBlockPoolUsedPercent())
6981          .put("volfails", node.getVolumeFailures());
6982      VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
6983      if (volumeFailureSummary != null) {
6984        innerinfo
6985            .put("failedStorageLocations",
6986                volumeFailureSummary.getFailedStorageLocations())
6987            .put("lastVolumeFailureDate",
6988                volumeFailureSummary.getLastVolumeFailureDate())
6989            .put("estimatedCapacityLostTotal",
6990                volumeFailureSummary.getEstimatedCapacityLostTotal());
6991      }
6992      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo.build());
6993    }
6994    return JSON.toString(info);
6995  }
6996
6997  /**
6998   * Returned information is a JSON representation of map with host name as the
6999   * key and value is a map of dead node attribute keys to its values
7000   */
7001  @Override // NameNodeMXBean
7002  public String getDeadNodes() {
7003    final Map<String, Map<String, Object>> info = 
7004      new HashMap<String, Map<String, Object>>();
7005    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
7006    blockManager.getDatanodeManager().fetchDatanodes(null, dead, true);
7007    for (DatanodeDescriptor node : dead) {
7008      Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
7009          .put("lastContact", getLastContact(node))
7010          .put("decommissioned", node.isDecommissioned())
7011          .put("xferaddr", node.getXferAddr())
7012          .build();
7013      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
7014    }
7015    return JSON.toString(info);
7016  }
7017
7018  /**
7019   * Returned information is a JSON representation of map with host name as the
7020   * key and value is a map of decommissioning node attribute keys to its
7021   * values
7022   */
7023  @Override // NameNodeMXBean
7024  public String getDecomNodes() {
7025    final Map<String, Map<String, Object>> info = 
7026      new HashMap<String, Map<String, Object>>();
7027    final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager(
7028        ).getDecommissioningNodes();
7029    for (DatanodeDescriptor node : decomNodeList) {
7030      Map<String, Object> innerinfo = ImmutableMap
7031          .<String, Object> builder()
7032          .put("xferaddr", node.getXferAddr())
7033          .put("underReplicatedBlocks",
7034              node.decommissioningStatus.getUnderReplicatedBlocks())
7035          .put("decommissionOnlyReplicas",
7036              node.decommissioningStatus.getDecommissionOnlyReplicas())
7037          .put("underReplicateInOpenFiles",
7038              node.decommissioningStatus.getUnderReplicatedInOpenFiles())
7039          .build();
7040      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
7041    }
7042    return JSON.toString(info);
7043  }
7044
  /** Seconds since the given datanode's last heartbeat (monotonic clock). */
  private long getLastContact(DatanodeDescriptor alivenode) {
    return (monotonicNow() - alivenode.getLastUpdateMonotonic())/1000;
  }
7048
  /** DFS space used on the given datanode, in bytes. */
  private long getDfsUsed(DatanodeDescriptor alivenode) {
    return alivenode.getDfsUsed();
  }
7052
  /** JMX: cluster ID recorded in the fsimage storage. */
  @Override  // NameNodeMXBean
  public String getClusterId() {
    return getFSImage().getStorage().getClusterID();
  }
7057  
  /** JMX: block pool ID served by this namesystem. */
  @Override  // NameNodeMXBean
  public String getBlockPoolId() {
    return blockPoolId;
  }
7062  
7063  @Override  // NameNodeMXBean
7064  public String getNameDirStatuses() {
7065    Map<String, Map<File, StorageDirType>> statusMap =
7066      new HashMap<String, Map<File, StorageDirType>>();
7067    
7068    Map<File, StorageDirType> activeDirs = new HashMap<File, StorageDirType>();
7069    for (Iterator<StorageDirectory> it
7070        = getFSImage().getStorage().dirIterator(); it.hasNext();) {
7071      StorageDirectory st = it.next();
7072      activeDirs.put(st.getRoot(), st.getStorageDirType());
7073    }
7074    statusMap.put("active", activeDirs);
7075    
7076    List<Storage.StorageDirectory> removedStorageDirs
7077        = getFSImage().getStorage().getRemovedStorageDirs();
7078    Map<File, StorageDirType> failedDirs = new HashMap<File, StorageDirType>();
7079    for (StorageDirectory st : removedStorageDirs) {
7080      failedDirs.put(st.getRoot(), st.getStorageDirType());
7081    }
7082    statusMap.put("failed", failedDirs);
7083    
7084    return JSON.toString(statusMap);
7085  }
7086
  /**
   * JMX: JSON with min/median/max/stdDev of per-datanode DFS usage
   * percentages across live nodes. All values are "0.00%" when there are no
   * live nodes.
   */
  @Override // NameNodeMXBean
  public String getNodeUsage() {
    float median = 0;
    float max = 0;
    float min = 0;
    float dev = 0;

    final Map<String, Map<String,Object>> info =
        new HashMap<String, Map<String,Object>>();
    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);

    if (live.size() > 0) {
      // totalDfsUsed accumulates the sum of usage percentages, then is
      // divided by the node count below — after that point it is the mean.
      float totalDfsUsed = 0;
      float[] usages = new float[live.size()];
      int i = 0;
      for (DatanodeDescriptor dn : live) {
        usages[i++] = dn.getDfsUsedPercent();
        totalDfsUsed += dn.getDfsUsedPercent();
      }
      totalDfsUsed /= live.size();
      Arrays.sort(usages);
      // "median" is the upper-middle element for even-sized arrays.
      median = usages[usages.length / 2];
      max = usages[usages.length - 1];
      min = usages[0];

      // Population standard deviation around the mean usage.
      for (i = 0; i < usages.length; i++) {
        dev += (usages[i] - totalDfsUsed) * (usages[i] - totalDfsUsed);
      }
      dev = (float) Math.sqrt(dev / usages.length);
    }

    final Map<String, Object> innerInfo = new HashMap<String, Object>();
    innerInfo.put("min", StringUtils.format("%.2f%%", min));
    innerInfo.put("median", StringUtils.format("%.2f%%", median));
    innerInfo.put("max", StringUtils.format("%.2f%%", max));
    innerInfo.put("stdDev", StringUtils.format("%.2f%%", dev));
    info.put("nodeUsage", innerInfo);

    return JSON.toString(info);
  }
7128
  /**
   * JMX: JSON list describing each configured journal (edit-log destination):
   * whether it is required, disabled, its manager, and the state of its
   * current output stream.
   */
  @Override  // NameNodeMXBean
  public String getNameJournalStatus() {
    List<Map<String, String>> jasList = new ArrayList<Map<String, String>>();
    FSEditLog log = getFSImage().getEditLog();
    if (log != null) {
      boolean openForWrite = log.isOpenForWrite();
      for (JournalAndStream jas : log.getJournals()) {
        final Map<String, String> jasMap = new HashMap<String, String>();
        String manager = jas.getManager().toString();

        jasMap.put("required", String.valueOf(jas.isRequired()));
        jasMap.put("disabled", String.valueOf(jas.isDisabled()));
        jasMap.put("manager", manager);

        // Stream state: disabled beats open-for-write; an open log may still
        // have no current stream for this journal.
        if (jas.isDisabled()) {
          jasMap.put("stream", "Failed");
        } else if (openForWrite) {
          EditLogOutputStream elos = jas.getCurrentStream();
          if (elos != null) {
            jasMap.put("stream", elos.generateReport());
          } else {
            jasMap.put("stream", "not currently writing");
          }
        } else {
          jasMap.put("stream", "open for read");
        }
        jasList.add(jasMap);
      }
    }
    return JSON.toString(jasList);
  }
7160
7161  @Override // NameNodeMxBean
7162  public String getJournalTransactionInfo() {
7163    Map<String, String> txnIdMap = new HashMap<String, String>();
7164    txnIdMap.put("LastAppliedOrWrittenTxId",
7165        Long.toString(this.getFSImage().getLastAppliedOrWrittenTxId()));
7166    txnIdMap.put("MostRecentCheckpointTxId",
7167        Long.toString(this.getFSImage().getMostRecentCheckpointTxId()));
7168    return JSON.toString(txnIdMap);
7169  }
7170  
  @Override  // NameNodeMXBean
  public String getNNStarted() {
    // Human-readable start time of this NameNode (toString of the start time).
    return getStartTime().toString();
  }
7175
7176  @Override  // NameNodeMXBean
7177  public String getCompileInfo() {
7178    return VersionInfo.getDate() + " by " + VersionInfo.getUser() +
7179        " from " + VersionInfo.getBranch();
7180  }
7181
  /** @return the {@link BlockManager} for this namesystem. */
  public BlockManager getBlockManager() {
    return blockManager;
  }
7186
  /** @return the block ID manager for this namesystem. */
  public BlockIdManager getBlockIdManager() {
    return blockIdManager;
  }
7190
  /** @return the {@link FSDirectory} backing this namesystem. */
  public FSDirectory getFSDirectory() {
    return dir;
  }
  /** Set the FSDirectory. Exposed only so tests can inject a replacement. */
  @VisibleForTesting
  public void setFSDirectory(FSDirectory dir) {
    this.dir = dir;
  }
  /** @return the {@link CacheManager} for this namesystem. */
  public CacheManager getCacheManager() {
    return cacheManager;
  }
7204
7205  @Override  // NameNodeMXBean
7206  public String getCorruptFiles() {
7207    List<String> list = new ArrayList<String>();
7208    Collection<FSNamesystem.CorruptFileBlockInfo> corruptFileBlocks;
7209    try {
7210      corruptFileBlocks = listCorruptFileBlocks("/", null);
7211      int corruptFileCount = corruptFileBlocks.size();
7212      if (corruptFileCount != 0) {
7213        for (FSNamesystem.CorruptFileBlockInfo c : corruptFileBlocks) {
7214          list.add(c.toString());
7215        }
7216      }
7217    } catch (IOException e) {
7218      LOG.warn("Get corrupt file blocks returned error: " + e.getMessage());
7219    }
7220    return JSON.toString(list);
7221  }
7222
7223  @Override  //NameNodeMXBean
7224  public int getDistinctVersionCount() {
7225    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
7226      .size();
7227  }
7228
  @Override  //NameNodeMXBean
  public Map<String, Integer> getDistinctVersions() {
    // Software-version counts as reported by the DatanodeManager.
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
  }
7233
  @Override  //NameNodeMXBean
  public String getSoftwareVersion() {
    // Version string of this NameNode's software build.
    return VersionInfo.getVersion();
  }
7238
7239  /**
7240   * Verifies that the given identifier and password are valid and match.
7241   * @param identifier Token identifier.
7242   * @param password Password in the token.
7243   */
7244  public synchronized void verifyToken(DelegationTokenIdentifier identifier,
7245      byte[] password) throws InvalidToken, RetriableException {
7246    try {
7247      getDelegationTokenSecretManager().verifyToken(identifier, password);
7248    } catch (InvalidToken it) {
7249      if (inTransitionToActive()) {
7250        throw new RetriableException(it);
7251      }
7252      throw it;
7253    }
7254  }
7255  
  @Override
  public boolean isGenStampInFuture(Block block) {
    // Delegates entirely to the block ID manager.
    return blockIdManager.isGenStampInFuture(block);
  }
7260
  /** @return the edit log tailer; exposed for tests. */
  @VisibleForTesting
  public EditLogTailer getEditLogTailer() {
    return editLogTailer;
  }
7265  
  /** Replace the edit log tailer; test hook only. */
  @VisibleForTesting
  public void setEditLogTailerForTests(EditLogTailer tailer) {
    this.editLogTailer = tailer;
  }
7270  
  /** Swap in a different coarse namesystem lock; test hook only. */
  @VisibleForTesting
  void setFsLockForTests(ReentrantReadWriteLock lock) {
    this.fsLock.coarseLock = lock;
  }
7275  
  /** @return the coarse namesystem read/write lock; exposed for tests. */
  @VisibleForTesting
  public ReentrantReadWriteLock getFsLockForTests() {
    return fsLock.coarseLock;
  }
7280  
  /** @return the checkpoint lock; exposed for tests. */
  @VisibleForTesting
  public ReentrantLock getCpLockForTests() {
    return cpLock;
  }
7285
  /** @return the current safe-mode state object; exposed for tests. */
  @VisibleForTesting
  public SafeModeInfo getSafeModeInfoForTests() {
    return safeMode;
  }
7290  
  /** Replace the NameNode resource checker; test hook only. */
  @VisibleForTesting
  public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
    this.nnResourceChecker = nnResourceChecker;
  }
7295
  /** @return the snapshot manager for this namesystem. */
  public SnapshotManager getSnapshotManager() {
    return snapshotManager;
  }
7299  
  /**
   * Allow snapshots to be taken on a directory. Superuser only.
   * @param path the directory to mark as snapshottable
   * @throws IOException if in safe mode, the caller lacks privileges, or the
   *         directory cannot be made snapshottable
   */
  void allowSnapshot(String path) throws IOException {
    checkOperation(OperationCategory.WRITE);
    boolean success = false;
    writeLock();
    try {
      // Re-check under the write lock; state may have changed since the
      // unlocked check above.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot allow snapshot for " + path);
      checkSuperuserPrivilege();
      FSDirSnapshotOp.allowSnapshot(dir, snapshotManager, path);
      success = true;
    } finally {
      writeUnlock();
    }
    // Sync the edit log after releasing the write lock.
    getEditLog().logSync();
    logAuditEvent(success, "allowSnapshot", path, null, null);
  }
7317  
  /**
   * Stop allowing snapshots on a directory. Superuser only.
   * @param path the directory to mark as no longer snapshottable
   * @throws IOException if in safe mode, the caller lacks privileges, or the
   *         operation fails
   */
  void disallowSnapshot(String path) throws IOException {
    checkOperation(OperationCategory.WRITE);
    boolean success = false;
    writeLock();
    try {
      // Re-check under the write lock.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot disallow snapshot for " + path);
      checkSuperuserPrivilege();
      FSDirSnapshotOp.disallowSnapshot(dir, snapshotManager, path);
      success = true;
    } finally {
      writeUnlock();
    }
    // Sync the edit log after releasing the write lock.
    getEditLog().logSync();
    logAuditEvent(success, "disallowSnapshot", path, null, null);
  }
7335  
7336  /**
7337   * Create a snapshot
7338   * @param snapshotRoot The directory path where the snapshot is taken
7339   * @param snapshotName The name of the snapshot
7340   */
7341  String createSnapshot(String snapshotRoot, String snapshotName,
7342                        boolean logRetryCache) throws IOException {
7343    String snapshotPath = null;
7344    writeLock();
7345    try {
7346      checkOperation(OperationCategory.WRITE);
7347      checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot);
7348      snapshotPath = FSDirSnapshotOp.createSnapshot(dir,
7349          snapshotManager, snapshotRoot, snapshotName, logRetryCache);
7350    } finally {
7351      writeUnlock();
7352    }
7353    getEditLog().logSync();
7354    logAuditEvent(snapshotPath != null, "createSnapshot", snapshotRoot,
7355        snapshotPath, null);
7356    return snapshotPath;
7357  }
7358  
7359  /**
7360   * Rename a snapshot
7361   * @param path The directory path where the snapshot was taken
7362   * @param snapshotOldName Old snapshot name
7363   * @param snapshotNewName New snapshot name
7364   * @throws SafeModeException
7365   * @throws IOException 
7366   */
7367  void renameSnapshot(
7368      String path, String snapshotOldName, String snapshotNewName,
7369      boolean logRetryCache) throws IOException {
7370    checkOperation(OperationCategory.WRITE);
7371    boolean success = false;
7372    writeLock();
7373    try {
7374      checkOperation(OperationCategory.WRITE);
7375      checkNameNodeSafeMode("Cannot rename snapshot for " + path);
7376      FSDirSnapshotOp.renameSnapshot(dir, snapshotManager, path,
7377          snapshotOldName, snapshotNewName, logRetryCache);
7378      success = true;
7379    } finally {
7380      writeUnlock();
7381    }
7382    getEditLog().logSync();
7383    String oldSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotOldName);
7384    String newSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotNewName);
7385    logAuditEvent(success, "renameSnapshot", oldSnapshotRoot,
7386        newSnapshotRoot, null);
7387  }
7388  
7389  /**
7390   * Get the list of snapshottable directories that are owned 
7391   * by the current user. Return all the snapshottable directories if the 
7392   * current user is a super user.
7393   * @return The list of all the current snapshottable directories
7394   * @throws IOException
7395   */
7396  public SnapshottableDirectoryStatus[] getSnapshottableDirListing()
7397      throws IOException {
7398    SnapshottableDirectoryStatus[] status = null;
7399    checkOperation(OperationCategory.READ);
7400    boolean success = false;
7401    readLock();
7402    try {
7403      checkOperation(OperationCategory.READ);
7404      status = FSDirSnapshotOp.getSnapshottableDirListing(dir, snapshotManager);
7405      success = true;
7406    } finally {
7407      readUnlock();
7408    }
7409    logAuditEvent(success, "listSnapshottableDirectory", null, null, null);
7410    return status;
7411  }
7412  
7413  /**
7414   * Get the difference between two snapshots (or between a snapshot and the
7415   * current status) of a snapshottable directory.
7416   * 
7417   * @param path The full path of the snapshottable directory.
7418   * @param fromSnapshot Name of the snapshot to calculate the diff from. Null
7419   *          or empty string indicates the current tree.
7420   * @param toSnapshot Name of the snapshot to calculated the diff to. Null or
7421   *          empty string indicates the current tree.
7422   * @return A report about the difference between {@code fromSnapshot} and 
7423   *         {@code toSnapshot}. Modified/deleted/created/renamed files and 
7424   *         directories belonging to the snapshottable directories are listed 
7425   *         and labeled as M/-/+/R respectively. 
7426   * @throws IOException
7427   */
7428  SnapshotDiffReport getSnapshotDiffReport(String path,
7429      String fromSnapshot, String toSnapshot) throws IOException {
7430    SnapshotDiffReport diffs = null;
7431    checkOperation(OperationCategory.READ);
7432    readLock();
7433    try {
7434      checkOperation(OperationCategory.READ);
7435      diffs = FSDirSnapshotOp.getSnapshotDiffReport(dir, snapshotManager,
7436          path, fromSnapshot, toSnapshot);
7437    } finally {
7438      readUnlock();
7439    }
7440
7441    logAuditEvent(diffs != null, "computeSnapshotDiff", null, null, null);
7442    return diffs;
7443  }
7444  
7445  /**
7446   * Delete a snapshot of a snapshottable directory
7447   * @param snapshotRoot The snapshottable directory
7448   * @param snapshotName The name of the to-be-deleted snapshot
7449   * @throws SafeModeException
7450   * @throws IOException
7451   */
7452  void deleteSnapshot(String snapshotRoot, String snapshotName,
7453      boolean logRetryCache) throws IOException {
7454    checkOperation(OperationCategory.WRITE);
7455    boolean success = false;
7456    writeLock();
7457    BlocksMapUpdateInfo blocksToBeDeleted = null;
7458    try {
7459      checkOperation(OperationCategory.WRITE);
7460      checkNameNodeSafeMode("Cannot delete snapshot for " + snapshotRoot);
7461
7462      blocksToBeDeleted = FSDirSnapshotOp.deleteSnapshot(dir, snapshotManager,
7463          snapshotRoot, snapshotName, logRetryCache);
7464      success = true;
7465    } finally {
7466      writeUnlock();
7467    }
7468    getEditLog().logSync();
7469
7470    // Breaking the pattern as removing blocks have to happen outside of the
7471    // global lock
7472    if (blocksToBeDeleted != null) {
7473      removeBlocks(blocksToBeDeleted);
7474    }
7475
7476    String rootPath = Snapshot.getSnapshotPath(snapshotRoot, snapshotName);
7477    logAuditEvent(success, "deleteSnapshot", rootPath, null, null);
7478  }
7479
7480  /**
7481   * Remove a list of INodeDirectorySnapshottable from the SnapshotManager
7482   * @param toRemove the list of INodeDirectorySnapshottable to be removed
7483   */
7484  void removeSnapshottableDirs(List<INodeDirectory> toRemove) {
7485    if (snapshotManager != null) {
7486      snapshotManager.removeSnapshottable(toRemove);
7487    }
7488  }
7489
  /**
   * Query the current rolling upgrade status. Superuser only.
   * @return the rolling upgrade info with the created-rollback-image flag
   *         refreshed, or null if no rolling upgrade is in progress
   */
  RollingUpgradeInfo queryRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      if (rollingUpgradeInfo != null) {
        // Refresh the flag from disk state before returning.
        boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
        rollingUpgradeInfo.setCreatedRollbackImages(hasRollbackImage);
      }
      return rollingUpgradeInfo;
    } finally {
      readUnlock();
    }
  }
7504
  /**
   * Start a rolling upgrade. Superuser only. Idempotent: if a rolling upgrade
   * is already in progress, its info is returned unchanged.
   * @return info describing the (possibly pre-existing) rolling upgrade
   */
  RollingUpgradeInfo startRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isRollingUpgrade()) {
        return rollingUpgradeInfo;
      }
      long startTime = now();
      if (!haEnabled) { // for non-HA, we require NN to be in safemode
        startRollingUpgradeInternalForNonHA(startTime);
      } else { // for HA, NN cannot be in safemode
        checkNameNodeSafeMode("Failed to start rolling upgrade");
        startRollingUpgradeInternal(startTime);
      }

      getEditLog().logStartRollingUpgrade(rollingUpgradeInfo.getStartTime());
      if (haEnabled) {
        // roll the edit log to make sure the standby NameNode can tail
        getFSImage().rollEditLog();
      }
    } finally {
      writeUnlock();
    }

    // Sync and audit outside the write lock.
    getEditLog().logSync();
    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "startRollingUpgrade", null, null, null);
    }
    return rollingUpgradeInfo;
  }
7537
7538  /**
7539   * Update internal state to indicate that a rolling upgrade is in progress.
7540   * @param startTime rolling upgrade start time
7541   */
7542  void startRollingUpgradeInternal(long startTime)
7543      throws IOException {
7544    checkRollingUpgrade("start rolling upgrade");
7545    getFSImage().checkUpgrade();
7546    setRollingUpgradeInfo(false, startTime);
7547  }
7548
7549  /**
7550   * Update internal state to indicate that a rolling upgrade is in progress for
7551   * non-HA setup. This requires the namesystem is in SafeMode and after doing a
7552   * checkpoint for rollback the namesystem will quit the safemode automatically 
7553   */
7554  private void startRollingUpgradeInternalForNonHA(long startTime)
7555      throws IOException {
7556    Preconditions.checkState(!haEnabled);
7557    if (!isInSafeMode()) {
7558      throw new IOException("Safe mode should be turned ON "
7559          + "in order to create namespace image.");
7560    }
7561    checkRollingUpgrade("start rolling upgrade");
7562    getFSImage().checkUpgrade();
7563    // in non-HA setup, we do an extra checkpoint to generate a rollback image
7564    getFSImage().saveNamespace(this, NameNodeFile.IMAGE_ROLLBACK, null);
7565    LOG.info("Successfully saved namespace for preparing rolling upgrade.");
7566
7567    // leave SafeMode automatically
7568    setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
7569    setRollingUpgradeInfo(true, startTime);
7570  }
7571
  /**
   * Record that a rolling upgrade is in progress.
   * The 0L finalize time marks the upgrade as not yet finalized
   * (cf. finalizeRollingUpgradeInternal).
   */
  void setRollingUpgradeInfo(boolean createdRollbackImages, long startTime) {
    rollingUpgradeInfo = new RollingUpgradeInfo(blockPoolId,
        createdRollbackImages, startTime, 0L);
  }
7576
7577  public void setCreatedRollbackImages(boolean created) {
7578    if (rollingUpgradeInfo != null) {
7579      rollingUpgradeInfo.setCreatedRollbackImages(created);
7580    }
7581  }
7582
  /** @return rolling upgrade info, or null if none is in progress. */
  public RollingUpgradeInfo getRollingUpgradeInfo() {
    return rollingUpgradeInfo;
  }
7586
  /** @return whether a rollback fsimage still needs to be created. */
  public boolean isNeedRollbackFsImage() {
    return needRollbackFsImage;
  }
7590
  /** Set whether a rollback fsimage still needs to be created. */
  public void setNeedRollbackFsImage(boolean needRollbackFsImage) {
    this.needRollbackFsImage = needRollbackFsImage;
  }
7594
  @Override  // NameNodeMXBean
  public RollingUpgradeInfo.Bean getRollingUpgradeStatus() {
    if (!isRollingUpgrade()) {
      return null;
    }
    RollingUpgradeInfo upgradeInfo = getRollingUpgradeInfo();
    if (upgradeInfo.createdRollbackImages()) {
      // Fast path: rollback image already recorded; no lock needed.
      return new RollingUpgradeInfo.Bean(upgradeInfo);
    }
    readLock();
    try {
      // check again after acquiring the read lock.
      upgradeInfo = getRollingUpgradeInfo();
      if (upgradeInfo == null) {
        return null;
      }
      if (!upgradeInfo.createdRollbackImages()) {
        // Refresh the flag from on-disk image state.
        boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
        upgradeInfo.setCreatedRollbackImages(hasRollbackImage);
      }
    } catch (IOException ioe) {
      // Best effort: still report the (possibly stale) upgrade info.
      LOG.warn("Encountered exception setting Rollback Image", ioe);
    } finally {
      readUnlock();
    }
    return new RollingUpgradeInfo.Bean(upgradeInfo);
  }
7622
  /** @return true if a rolling upgrade is currently in progress. */
  public boolean isRollingUpgrade() {
    return rollingUpgradeInfo != null;
  }
7627
  /**
   * Fail if a rolling upgrade is already in progress.
   * @param action description of the attempted operation, used in the message
   * @throws RollingUpgradeException if a rolling upgrade is in progress
   */
  void checkRollingUpgrade(String action) throws RollingUpgradeException {
    if (isRollingUpgrade()) {
      throw new RollingUpgradeException("Failed to " + action
          + " since a rolling upgrade is already in progress."
          + " Existing rolling upgrade info:\n" + rollingUpgradeInfo);
    }
  }
7635
  /**
   * Finalize the current rolling upgrade. Superuser only.
   * @return the finalized upgrade info, or null if none was in progress
   */
  RollingUpgradeInfo finalizeRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    final RollingUpgradeInfo returnInfo;
    try {
      checkOperation(OperationCategory.WRITE);
      if (!isRollingUpgrade()) {
        return null;
      }
      checkNameNodeSafeMode("Failed to finalize rolling upgrade");

      returnInfo = finalizeRollingUpgradeInternal(now());
      getEditLog().logFinalizeRollingUpgrade(returnInfo.getFinalizeTime());
      if (haEnabled) {
        // roll the edit log to make sure the standby NameNode can tail
        getFSImage().rollEditLog();
      }
      // Promote the rollback image to the regular image name.
      getFSImage().updateStorageVersion();
      getFSImage().renameCheckpoint(NameNodeFile.IMAGE_ROLLBACK,
          NameNodeFile.IMAGE);
    } finally {
      writeUnlock();
    }

    if (!haEnabled) {
      // Sync not needed for ha since the edit was rolled after logging.
      getEditLog().logSync();
    }

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "finalizeRollingUpgrade", null, null, null);
    }
    return returnInfo;
  }
7671
  /**
   * Clear the rolling upgrade state and build the finalized info.
   * Caller must ensure a rolling upgrade is in progress (rollingUpgradeInfo
   * is dereferenced without a null check).
   */
  RollingUpgradeInfo finalizeRollingUpgradeInternal(long finalizeTime)
      throws RollingUpgradeException {
    final long startTime = rollingUpgradeInfo.getStartTime();
    rollingUpgradeInfo = null;
    return new RollingUpgradeInfo(blockPoolId, false, startTime, finalizeTime);
  }
7678
  /**
   * Add a new cache directive.
   * @param directive the directive to add
   * @param flags cache flags; FORCE skips waiting for a cache rescan
   * @param logRetryCache whether to record this call in the retry cache
   * @return the id of the added directive, or 0 if it could not be added
   */
  long addCacheDirective(CacheDirectiveInfo directive,
                         EnumSet<CacheFlag> flags, boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    CacheDirectiveInfo effectiveDirective = null;
    if (!flags.contains(CacheFlag.FORCE)) {
      // Must happen before taking the write lock.
      cacheManager.waitForRescanIfNeeded();
    }
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache directive", safeMode);
      }
      effectiveDirective = FSNDNCacheOp.addCacheDirective(this, cacheManager,
          directive, flags, logRetryCache);
    } finally {
      writeUnlock();
      // Sync and audit after releasing the lock, whether or not we succeeded.
      boolean success = effectiveDirective != null;
      if (success) {
        getEditLog().logSync();
      }

      String effectiveDirectiveStr = effectiveDirective != null ?
          effectiveDirective.toString() : null;
      logAuditEvent(success, "addCacheDirective", effectiveDirectiveStr,
          null, null);
    }
    return effectiveDirective != null ? effectiveDirective.getId() : 0;
  }
7710
7711  void modifyCacheDirective(CacheDirectiveInfo directive,
7712      EnumSet<CacheFlag> flags, boolean logRetryCache) throws IOException {
7713    checkOperation(OperationCategory.WRITE);
7714    boolean success = false;
7715    if (!flags.contains(CacheFlag.FORCE)) {
7716      cacheManager.waitForRescanIfNeeded();
7717    }
7718    writeLock();
7719    try {
7720      checkOperation(OperationCategory.WRITE);
7721      if (isInSafeMode()) {
7722        throw new SafeModeException(
7723            "Cannot add cache directive", safeMode);
7724      }
7725      FSNDNCacheOp.modifyCacheDirective(this, cacheManager, directive, flags,
7726          logRetryCache);
7727      success = true;
7728    } finally {
7729      writeUnlock();
7730      if (success) {
7731        getEditLog().logSync();
7732      }
7733      String idStr = "{id: " + directive.getId().toString() + "}";
7734      logAuditEvent(success, "modifyCacheDirective", idStr,
7735          directive.toString(), null);
7736    }
7737  }
7738
  /**
   * Remove a cache directive by id.
   * @param id the directive id to remove
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if in safe mode or the removal fails
   */
  void removeCacheDirective(long id, boolean logRetryCache) throws IOException {
    checkOperation(OperationCategory.WRITE);
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache directives", safeMode);
      }
      FSNDNCacheOp.removeCacheDirective(this, cacheManager, id, logRetryCache);
      success = true;
    } finally {
      writeUnlock();
      // Audit after releasing the lock, whether or not we succeeded.
      String idStr = "{id: " + Long.toString(id) + "}";
      logAuditEvent(success, "removeCacheDirective", idStr, null,
          null);
    }
    getEditLog().logSync();
  }
7759
  /**
   * List cache directives starting from the given id, filtered.
   * @param startId directive id to resume listing from
   * @param filter filter restricting the results
   * @return a batch of matching cache directive entries
   */
  BatchedListEntries<CacheDirectiveEntry> listCacheDirectives(
      long startId, CacheDirectiveInfo filter) throws IOException {
    checkOperation(OperationCategory.READ);
    BatchedListEntries<CacheDirectiveEntry> results;
    // Wait for a cache rescan before taking the read lock.
    cacheManager.waitForRescanIfNeeded();
    readLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.READ);
      results = FSNDNCacheOp.listCacheDirectives(this, cacheManager, startId,
          filter);
      success = true;
    } finally {
      readUnlock();
      // NOTE(review): filter.toString() throws NPE if filter is null —
      // callers appear to always pass a non-null filter; confirm.
      logAuditEvent(success, "listCacheDirectives", filter.toString(), null,
          null);
    }
    return results;
  }
7779
  /**
   * Add a new cache pool.
   * @param req the pool to create
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if in safe mode or the pool cannot be added
   */
  void addCachePool(CachePoolInfo req, boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    boolean success = false;
    String poolInfoStr = null;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache pool " + req.getPoolName(), safeMode);
      }
      CachePoolInfo info = FSNDNCacheOp.addCachePool(this, cacheManager, req,
          logRetryCache);
      poolInfoStr = info.toString();
      success = true;
    } finally {
      writeUnlock();
      // Audit after releasing the lock, whether or not we succeeded.
      logAuditEvent(success, "addCachePool", poolInfoStr, null, null);
    }
    
    getEditLog().logSync();
  }
7803
  /**
   * Modify an existing cache pool.
   * @param req the pool, identified by name, with updated fields
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if in safe mode or the modification fails
   */
  void modifyCachePool(CachePoolInfo req, boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        // NOTE(review): req is dereferenced here without a null check, yet
        // the audit path below guards req == null — verify whether a null
        // req is actually possible.
        throw new SafeModeException(
            "Cannot modify cache pool " + req.getPoolName(), safeMode);
      }
      FSNDNCacheOp.modifyCachePool(this, cacheManager, req, logRetryCache);
      success = true;
    } finally {
      writeUnlock();
      String poolNameStr = "{poolName: " +
          (req == null ? null : req.getPoolName()) + "}";
      logAuditEvent(success, "modifyCachePool", poolNameStr,
                    req == null ? null : req.toString(), null);
    }

    getEditLog().logSync();
  }
7827
  /**
   * Remove a cache pool by name.
   * @param cachePoolName name of the pool to remove
   * @param logRetryCache whether to record this call in the retry cache
   * @throws IOException if in safe mode or the removal fails
   */
  void removeCachePool(String cachePoolName, boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache pool " + cachePoolName, safeMode);
      }
      FSNDNCacheOp.removeCachePool(this, cacheManager, cachePoolName,
          logRetryCache);
      success = true;
    } finally {
      writeUnlock();
      // Audit after releasing the lock, whether or not we succeeded.
      String poolNameStr = "{poolName: " + cachePoolName + "}";
      logAuditEvent(success, "removeCachePool", poolNameStr, null, null);
    }
    
    getEditLog().logSync();
  }
7850
  /**
   * List cache pools, resuming after the given key.
   * @param prevKey the last pool name from the previous batch
   * @return a batch of cache pool entries
   */
  BatchedListEntries<CachePoolEntry> listCachePools(String prevKey)
      throws IOException {
    BatchedListEntries<CachePoolEntry> results;
    checkOperation(OperationCategory.READ);
    boolean success = false;
    // Wait for a cache rescan before taking the read lock.
    cacheManager.waitForRescanIfNeeded();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      results = FSNDNCacheOp.listCachePools(this, cacheManager, prevKey);
      success = true;
    } finally {
      readUnlock();
      logAuditEvent(success, "listCachePools", null, null, null);
    }
    return results;
  }
7868
  /** Add or update the ACL entries in {@code aclSpec} on {@code src}. */
  void modifyAclEntries(final String src, List<AclEntry> aclSpec)
      throws IOException {
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot modify ACL entries on " + src);
      auditStat = FSDirAclOp.modifyAclEntries(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Audit the denial before propagating it.
      logAuditEvent(false, "modifyAclEntries", src);
      throw e;
    } finally {
      writeUnlock();
    }
    // Sync the edit log after releasing the write lock.
    getEditLog().logSync();
    logAuditEvent(true, "modifyAclEntries", src, null, auditStat);
  }
7887
  /** Remove the ACL entries listed in {@code aclSpec} from {@code src}. */
  void removeAclEntries(final String src, List<AclEntry> aclSpec)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    HdfsFileStatus auditStat = null;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL entries on " + src);
      auditStat = FSDirAclOp.removeAclEntries(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Audit the denial before propagating it.
      logAuditEvent(false, "removeAclEntries", src);
      throw e;
    } finally {
      writeUnlock();
    }
    // Sync the edit log after releasing the write lock.
    getEditLog().logSync();
    logAuditEvent(true, "removeAclEntries", src, null, auditStat);
  }
7906
  /** Remove the default ACL entries from {@code src}. */
  void removeDefaultAcl(final String src) throws IOException {
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove default ACL entries on " + src);
      auditStat = FSDirAclOp.removeDefaultAcl(dir, src);
    } catch (AccessControlException e) {
      // Audit the denial before propagating it.
      logAuditEvent(false, "removeDefaultAcl", src);
      throw e;
    } finally {
      writeUnlock();
    }
    // Sync the edit log after releasing the write lock.
    getEditLog().logSync();
    logAuditEvent(true, "removeDefaultAcl", src, null, auditStat);
  }
7924
  /** Remove the entire ACL from {@code src}. */
  void removeAcl(final String src) throws IOException {
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL on " + src);
      auditStat = FSDirAclOp.removeAcl(dir, src);
    } catch (AccessControlException e) {
      // Audit the denial before propagating it.
      logAuditEvent(false, "removeAcl", src);
      throw e;
    } finally {
      writeUnlock();
    }
    // Sync the edit log after releasing the write lock.
    getEditLog().logSync();
    logAuditEvent(true, "removeAcl", src, null, auditStat);
  }
7942
  /** Replace the ACL on {@code src} with the entries in {@code aclSpec}. */
  void setAcl(final String src, List<AclEntry> aclSpec) throws IOException {
    HdfsFileStatus auditStat = null;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set ACL on " + src);
      auditStat = FSDirAclOp.setAcl(dir, src, aclSpec);
    } catch (AccessControlException e) {
      // Audit the denial before propagating it.
      logAuditEvent(false, "setAcl", src);
      throw e;
    } finally {
      writeUnlock();
    }
    // Sync the edit log after releasing the write lock.
    getEditLog().logSync();
    logAuditEvent(true, "setAcl", src, null, auditStat);
  }
7960
  /** @return the ACL status of {@code src}. */
  AclStatus getAclStatus(String src) throws IOException {
    checkOperation(OperationCategory.READ);
    boolean success = false;
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      final AclStatus ret = FSDirAclOp.getAclStatus(dir, src);
      success = true;
      return ret;
    } finally {
      readUnlock();
      // Audit after releasing the lock, whether or not we succeeded.
      logAuditEvent(success, "getAclStatus", src);
    }
  }
7975
7976  /**
7977   * Create an encryption zone on directory src using the specified key.
7978   *
7979   * @param src     the path of a directory which will be the root of the
7980   *                encryption zone. The directory must be empty.
7981   * @param keyName name of a key which must be present in the configured
7982   *                KeyProvider.
7983   * @throws AccessControlException  if the caller is not the superuser.
7984   * @throws UnresolvedLinkException if the path can't be resolved.
7985   * @throws SafeModeException       if the Namenode is in safe mode.
7986   */
7987  void createEncryptionZone(final String src, final String keyName,
7988                            boolean logRetryCache)
7989    throws IOException, UnresolvedLinkException,
7990      SafeModeException, AccessControlException {
7991    try {
7992      if (provider == null) {
7993        throw new IOException(
7994            "Can't create an encryption zone for " + src +
7995            " since no key provider is available.");
7996      }
7997      if (keyName == null || keyName.isEmpty()) {
7998        throw new IOException("Must specify a key name when creating an " +
7999            "encryption zone");
8000      }
8001      KeyProvider.Metadata metadata = provider.getMetadata(keyName);
8002      if (metadata == null) {
8003        /*
8004         * It would be nice if we threw something more specific than
8005         * IOException when the key is not found, but the KeyProvider API
8006         * doesn't provide for that. If that API is ever changed to throw
8007         * something more specific (e.g. UnknownKeyException) then we can
8008         * update this to match it, or better yet, just rethrow the
8009         * KeyProvider's exception.
8010         */
8011        throw new IOException("Key " + keyName + " doesn't exist.");
8012      }
8013      // If the provider supports pool for EDEKs, this will fill in the pool
8014      generateEncryptedDataEncryptionKey(keyName);
8015      createEncryptionZoneInt(src, metadata.getCipher(),
8016          keyName, logRetryCache);
8017    } catch (AccessControlException e) {
8018      logAuditEvent(false, "createEncryptionZone", src);
8019      throw e;
8020    }
8021  }
8022
  /**
   * Worker for createEncryptionZone: re-checks preconditions under the
   * write lock, stores the encryption-zone xattr on the directory, and
   * records the operation in the edit log before audit-logging success.
   *
   * @param srcArg        path of the directory that becomes the zone root;
   *                      a reserved path is resolved before use.
   * @param cipher        cipher name taken from the key's metadata.
   * @param keyName       name of the key backing the zone.
   * @param logRetryCache whether to record this op in the retry cache.
   */
  private void createEncryptionZoneInt(final String srcArg, String cipher,
      String keyName, final boolean logRetryCache) throws IOException {
    String src = srcArg;
    HdfsFileStatus resultingStat = null;
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    final byte[][] pathComponents =
      FSDirectory.getPathComponentsForReservedPath(src);
    FSPermissionChecker pc = getPermissionChecker();
    writeLock();
    try {
      // The checks are repeated under the lock; state may have changed
      // since the unlocked checks above.
      checkSuperuserPrivilege();
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create encryption zone on " + src);
      src = dir.resolvePath(pc, src, pathComponents);

      final CipherSuite suite = CipherSuite.convert(cipher);
      // For now this is hardcoded, as we only support one method.
      final CryptoProtocolVersion version =
          CryptoProtocolVersion.ENCRYPTION_ZONES;
      final XAttr ezXAttr = dir.createEncryptionZone(src, suite,
          version, keyName);
      List<XAttr> xAttrs = Lists.newArrayListWithCapacity(1);
      xAttrs.add(ezXAttr);
      getEditLog().logSetXAttrs(src, xAttrs, logRetryCache);
      final INodesInPath iip = dir.getINodesInPath4Write(src, false);
      resultingStat = dir.getAuditFileInfo(iip);
    } finally {
      writeUnlock();
    }
    // The edit log is synced after the write lock is released.
    getEditLog().logSync();
    // Audit with the original path the caller passed in.
    logAuditEvent(true, "createEncryptionZone", srcArg, null, resultingStat);
  }
8056
8057  /**
8058   * Get the encryption zone for the specified path.
8059   *
8060   * @param srcArg the path of a file or directory to get the EZ for.
8061   * @return the EZ of the of the path or null if none.
8062   * @throws AccessControlException  if the caller is not the superuser.
8063   * @throws UnresolvedLinkException if the path can't be resolved.
8064   */
8065  EncryptionZone getEZForPath(final String srcArg)
8066    throws AccessControlException, UnresolvedLinkException, IOException {
8067    String src = srcArg;
8068    HdfsFileStatus resultingStat = null;
8069    final byte[][] pathComponents =
8070        FSDirectory.getPathComponentsForReservedPath(src);
8071    boolean success = false;
8072    final FSPermissionChecker pc = getPermissionChecker();
8073    checkOperation(OperationCategory.READ);
8074    readLock();
8075    try {
8076      checkOperation(OperationCategory.READ);
8077      src = dir.resolvePath(pc, src, pathComponents);
8078      final INodesInPath iip = dir.getINodesInPath(src, true);
8079      if (isPermissionEnabled) {
8080        dir.checkPathAccess(pc, iip, FsAction.READ);
8081      }
8082      final EncryptionZone ret = dir.getEZForPath(iip);
8083      resultingStat = dir.getAuditFileInfo(iip);
8084      success = true;
8085      return ret;
8086    } finally {
8087      readUnlock();
8088      logAuditEvent(success, "getEZForPath", srcArg, null, resultingStat);
8089    }
8090  }
8091
8092  BatchedListEntries<EncryptionZone> listEncryptionZones(long prevId)
8093      throws IOException {
8094    boolean success = false;
8095    checkSuperuserPrivilege();
8096    checkOperation(OperationCategory.READ);
8097    readLock();
8098    try {
8099      checkSuperuserPrivilege();
8100      checkOperation(OperationCategory.READ);
8101      final BatchedListEntries<EncryptionZone> ret =
8102          dir.listEncryptionZones(prevId);
8103      success = true;
8104      return ret;
8105    } finally {
8106      readUnlock();
8107      logAuditEvent(success, "listEncryptionZones", null);
8108    }
8109  }
8110
  /**
   * Sets the extended attribute {@code xAttr} on {@code src}, then syncs
   * the edit log and writes an audit entry.
   *
   * @param src           path of the inode to modify.
   * @param xAttr         the attribute (name and value) to set.
   * @param flag          set-operation flags, passed through to
   *                      FSDirXAttrOp.setXAttr.
   * @param logRetryCache whether to record this op in the retry cache.
   * @throws IOException e.g. when the namenode is in safe mode.
   */
  void setXAttr(String src, XAttr xAttr, EnumSet<XAttrSetFlag> flag,
                boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    HdfsFileStatus auditStat = null;
    writeLock();
    try {
      // Re-check under the lock; state may have changed since above.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set XAttr on " + src);
      auditStat = FSDirXAttrOp.setXAttr(dir, src, xAttr, flag, logRetryCache);
    } catch (AccessControlException e) {
      // Denied attempts are audit-logged before the exception propagates.
      logAuditEvent(false, "setXAttr", src);
      throw e;
    } finally {
      writeUnlock();
    }
    // The edit log is synced after the write lock is released.
    getEditLog().logSync();
    logAuditEvent(true, "setXAttr", src, null, auditStat);
  }
8130
8131  List<XAttr> getXAttrs(final String src, List<XAttr> xAttrs)
8132      throws IOException {
8133    checkOperation(OperationCategory.READ);
8134    readLock();
8135    try {
8136      checkOperation(OperationCategory.READ);
8137      return FSDirXAttrOp.getXAttrs(dir, src, xAttrs);
8138    } catch (AccessControlException e) {
8139      logAuditEvent(false, "getXAttrs", src);
8140      throw e;
8141    } finally {
8142      readUnlock();
8143    }
8144  }
8145
8146  List<XAttr> listXAttrs(String src) throws IOException {
8147    checkOperation(OperationCategory.READ);
8148    readLock();
8149    try {
8150      checkOperation(OperationCategory.READ);
8151      return FSDirXAttrOp.listXAttrs(dir, src);
8152    } catch (AccessControlException e) {
8153      logAuditEvent(false, "listXAttrs", src);
8154      throw e;
8155    } finally {
8156      readUnlock();
8157    }
8158  }
8159
  /**
   * Removes the extended attribute {@code xAttr} from {@code src}, then
   * syncs the edit log and writes an audit entry.
   *
   * @param src           path of the inode to modify.
   * @param xAttr         the attribute to remove.
   * @param logRetryCache whether to record this op in the retry cache.
   * @throws IOException e.g. when the namenode is in safe mode.
   */
  void removeXAttr(String src, XAttr xAttr, boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    HdfsFileStatus auditStat = null;
    writeLock();
    try {
      // Re-check under the lock; state may have changed since above.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove XAttr entry on " + src);
      auditStat = FSDirXAttrOp.removeXAttr(dir, src, xAttr, logRetryCache);
    } catch (AccessControlException e) {
      // Denied attempts are audit-logged before the exception propagates.
      logAuditEvent(false, "removeXAttr", src);
      throw e;
    } finally {
      writeUnlock();
    }
    // The edit log is synced after the write lock is released.
    getEditLog().logSync();
    logAuditEvent(true, "removeXAttr", src, null, auditStat);
  }
8178
8179  void checkAccess(String src, FsAction mode) throws IOException {
8180    checkOperation(OperationCategory.READ);
8181    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
8182    readLock();
8183    try {
8184      checkOperation(OperationCategory.READ);
8185      src = FSDirectory.resolvePath(src, pathComponents, dir);
8186      final INodesInPath iip = dir.getINodesInPath(src, true);
8187      INode inode = iip.getLastINode();
8188      if (inode == null) {
8189        throw new FileNotFoundException("Path not found");
8190      }
8191      if (isPermissionEnabled) {
8192        FSPermissionChecker pc = getPermissionChecker();
8193        dir.checkPathAccess(pc, iip, mode);
8194      }
8195    } catch (AccessControlException e) {
8196      logAuditEvent(false, "checkAccess", src);
8197      throw e;
8198    } finally {
8199      readUnlock();
8200    }
8201  }
8202
8203  /**
8204   * Default AuditLogger implementation; used when no access logger is
8205   * defined in the config file. It can also be explicitly listed in the
8206   * config file.
8207   */
8208  private static class DefaultAuditLogger extends HdfsAuditLogger {
8209
8210    private boolean logTokenTrackingId;
8211
8212    @Override
8213    public void initialize(Configuration conf) {
8214      logTokenTrackingId = conf.getBoolean(
8215          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
8216          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT);
8217    }
8218
8219    @Override
8220    public void logAuditEvent(boolean succeeded, String userName,
8221        InetAddress addr, String cmd, String src, String dst,
8222        FileStatus status, UserGroupInformation ugi,
8223        DelegationTokenSecretManager dtSecretManager) {
8224      if (auditLog.isInfoEnabled()) {
8225        final StringBuilder sb = auditBuffer.get();
8226        sb.setLength(0);
8227        sb.append("allowed=").append(succeeded).append("\t");
8228        sb.append("ugi=").append(userName).append("\t");
8229        sb.append("ip=").append(addr).append("\t");
8230        sb.append("cmd=").append(cmd).append("\t");
8231        sb.append("src=").append(src).append("\t");
8232        sb.append("dst=").append(dst).append("\t");
8233        if (null == status) {
8234          sb.append("perm=null");
8235        } else {
8236          sb.append("perm=");
8237          sb.append(status.getOwner()).append(":");
8238          sb.append(status.getGroup()).append(":");
8239          sb.append(status.getPermission());
8240        }
8241        if (logTokenTrackingId) {
8242          sb.append("\t").append("trackingId=");
8243          String trackingId = null;
8244          if (ugi != null && dtSecretManager != null
8245              && ugi.getAuthenticationMethod() == AuthenticationMethod.TOKEN) {
8246            for (TokenIdentifier tid: ugi.getTokenIdentifiers()) {
8247              if (tid instanceof DelegationTokenIdentifier) {
8248                DelegationTokenIdentifier dtid =
8249                    (DelegationTokenIdentifier)tid;
8250                trackingId = dtSecretManager.getTokenTrackingId(dtid);
8251                break;
8252              }
8253            }
8254          }
8255          sb.append(trackingId);
8256        }
8257        sb.append("\t").append("proto=");
8258        sb.append(NamenodeWebHdfsMethods.isWebHdfsInvocation() ? "webhdfs" : "rpc");
8259        logAuditMessage(sb.toString());
8260      }
8261    }
8262
8263    public void logAuditMessage(String message) {
8264      auditLog.info(message);
8265    }
8266  }
8267
8268  private static void enableAsyncAuditLog() {
8269    if (!(auditLog instanceof Log4JLogger)) {
8270      LOG.warn("Log4j is required to enable async auditlog");
8271      return;
8272    }
8273    Logger logger = ((Log4JLogger)auditLog).getLogger();
8274    @SuppressWarnings("unchecked")
8275    List<Appender> appenders = Collections.list(logger.getAllAppenders());
8276    // failsafe against trying to async it more than once
8277    if (!appenders.isEmpty() && !(appenders.get(0) instanceof AsyncAppender)) {
8278      AsyncAppender asyncAppender = new AsyncAppender();
8279      // change logger to have an async appender containing all the
8280      // previously configured appenders
8281      for (Appender appender : appenders) {
8282        logger.removeAppender(appender);
8283        asyncAppender.addAppender(appender);
8284      }
8285      logger.addAppender(asyncAppender);        
8286    }
8287  }
8288
8289}
8290