001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.namenode; 019 020import static org.apache.hadoop.crypto.key.KeyProviderCryptoExtension.EncryptedKeyVersion; 021import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT; 022import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY; 023import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT; 024import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY; 025import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT; 026import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY; 027import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_DEFAULT; 028import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY; 029import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_DEFAULT; 030import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY; 031import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT; 032import static 
org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_KEY; 033import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT; 034import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY; 035import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT; 036import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY; 037import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT; 038import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY; 039import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY; 040import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT; 041import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY; 042import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT; 043import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY; 044import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT; 045import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY; 046import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME; 047import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT; 048import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY; 049import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT; 050import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY; 051import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT; 052import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY; 053import static 
org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT; 054import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY; 055import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY; 056import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY; 057import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS; 058import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT; 059import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD; 060import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT; 061import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT; 062import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY; 063import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC; 064import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT; 065import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY; 066import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT; 067import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY; 068import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY; 069import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT; 070import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY; 071import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY; 072import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT; 073import static 
org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY; 074import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT; 075import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY; 076import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT; 077import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY; 078import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY; 079import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT; 080import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY; 081import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT; 082import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY; 083import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY; 084import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT; 085import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY; 086import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT; 087import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY; 088import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT; 089import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY; 090import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT; 091import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_KEY; 092import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.SECURITY_XATTR_UNREADABLE_BY_SUPERUSER; 093import static org.apache.hadoop.util.Time.now; 094import static org.apache.hadoop.util.Time.monotonicNow; 095 096import java.io.BufferedWriter; 097import 
java.io.ByteArrayInputStream; 098import java.io.DataInput; 099import java.io.DataInputStream; 100import java.io.DataOutputStream; 101import java.io.File; 102import java.io.FileNotFoundException; 103import java.io.FileOutputStream; 104import java.io.IOException; 105import java.io.OutputStreamWriter; 106import java.io.PrintWriter; 107import java.io.StringWriter; 108import java.lang.management.ManagementFactory; 109import java.net.InetAddress; 110import java.net.URI; 111import java.security.GeneralSecurityException; 112import java.util.ArrayList; 113import java.util.Arrays; 114import java.util.Collection; 115import java.util.Collections; 116import java.util.Date; 117import java.util.EnumSet; 118import java.util.HashMap; 119import java.util.HashSet; 120import java.util.Iterator; 121import java.util.LinkedHashSet; 122import java.util.List; 123import java.util.Map; 124import java.util.Set; 125import java.util.TreeMap; 126import java.util.concurrent.TimeUnit; 127import java.util.concurrent.locks.Condition; 128import java.util.concurrent.locks.ReentrantLock; 129import java.util.concurrent.locks.ReentrantReadWriteLock; 130 131import javax.management.NotCompliantMBeanException; 132import javax.management.ObjectName; 133import javax.management.StandardMBean; 134 135import org.apache.commons.logging.Log; 136import org.apache.commons.logging.LogFactory; 137import org.apache.commons.logging.impl.Log4JLogger; 138import org.apache.hadoop.HadoopIllegalArgumentException; 139import org.apache.hadoop.classification.InterfaceAudience; 140import org.apache.hadoop.conf.Configuration; 141import org.apache.hadoop.crypto.CipherSuite; 142import org.apache.hadoop.crypto.CryptoProtocolVersion; 143import org.apache.hadoop.crypto.key.KeyProvider; 144import org.apache.hadoop.crypto.CryptoCodec; 145import org.apache.hadoop.crypto.key.KeyProviderCryptoExtension; 146import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedListEntries; 147import org.apache.hadoop.fs.CacheFlag; 148import 
org.apache.hadoop.fs.ContentSummary; 149import org.apache.hadoop.fs.CreateFlag; 150import org.apache.hadoop.fs.FileAlreadyExistsException; 151import org.apache.hadoop.fs.FileEncryptionInfo; 152import org.apache.hadoop.fs.FileStatus; 153import org.apache.hadoop.fs.FileSystem; 154import org.apache.hadoop.fs.FsServerDefaults; 155import org.apache.hadoop.fs.InvalidPathException; 156import org.apache.hadoop.fs.Options; 157import org.apache.hadoop.fs.ParentNotDirectoryException; 158import org.apache.hadoop.fs.Path; 159import org.apache.hadoop.fs.UnresolvedLinkException; 160import org.apache.hadoop.fs.XAttr; 161import org.apache.hadoop.fs.XAttrSetFlag; 162import org.apache.hadoop.fs.permission.AclEntry; 163import org.apache.hadoop.fs.permission.AclStatus; 164import org.apache.hadoop.fs.permission.FsAction; 165import org.apache.hadoop.fs.permission.FsPermission; 166import org.apache.hadoop.fs.permission.PermissionStatus; 167import org.apache.hadoop.fs.StorageType; 168import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; 169import org.apache.hadoop.ha.ServiceFailedException; 170import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy; 171import org.apache.hadoop.hdfs.DFSConfigKeys; 172import org.apache.hadoop.hdfs.DFSUtil; 173import org.apache.hadoop.hdfs.HAUtil; 174import org.apache.hadoop.hdfs.HdfsConfiguration; 175import org.apache.hadoop.hdfs.UnknownCryptoProtocolVersionException; 176import org.apache.hadoop.hdfs.XAttrHelper; 177import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException; 178import org.apache.hadoop.hdfs.protocol.Block; 179import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry; 180import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo; 181import org.apache.hadoop.hdfs.protocol.CachePoolEntry; 182import org.apache.hadoop.hdfs.protocol.CachePoolInfo; 183import org.apache.hadoop.hdfs.protocol.ClientProtocol; 184import org.apache.hadoop.hdfs.protocol.DatanodeID; 185import org.apache.hadoop.hdfs.protocol.DatanodeInfo; 186import 
org.apache.hadoop.hdfs.protocol.DirectoryListing; 187import org.apache.hadoop.hdfs.protocol.EncryptionZone; 188import org.apache.hadoop.hdfs.protocol.ExtendedBlock; 189import org.apache.hadoop.hdfs.protocol.HdfsConstants; 190import org.apache.hadoop.hdfs.protocol.LastBlockWithStatus; 191import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType; 192import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; 193import org.apache.hadoop.hdfs.protocol.HdfsFileStatus; 194import org.apache.hadoop.hdfs.protocol.LocatedBlock; 195import org.apache.hadoop.hdfs.protocol.LocatedBlocks; 196import org.apache.hadoop.hdfs.protocol.QuotaExceededException; 197import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException; 198import org.apache.hadoop.hdfs.protocol.RollingUpgradeException; 199import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo; 200import org.apache.hadoop.hdfs.protocol.SnapshotAccessControlException; 201import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport; 202import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus; 203import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure; 204import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager; 205import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode; 206import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; 207import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager; 208import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager.SecretManagerState; 209import org.apache.hadoop.hdfs.server.blockmanagement.BlockCollection; 210import org.apache.hadoop.hdfs.server.blockmanagement.BlockIdManager; 211import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguous; 212import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguousUnderConstruction; 213import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; 
214import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; 215import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager; 216import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics; 217import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo; 218import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState; 219import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole; 220import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.RollingUpgradeStartupOption; 221import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; 222import org.apache.hadoop.hdfs.server.common.Storage; 223import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType; 224import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; 225import org.apache.hadoop.hdfs.server.common.Util; 226import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection; 227import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo; 228import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream; 229import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease; 230import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile; 231import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory; 232import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer; 233import org.apache.hadoop.hdfs.server.namenode.ha.HAContext; 234import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer; 235import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean; 236import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics; 237import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot; 238import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager; 239import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase; 240import 
org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress; 241import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter; 242import org.apache.hadoop.hdfs.server.namenode.startupprogress.Status; 243import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step; 244import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType; 245import org.apache.hadoop.hdfs.server.namenode.top.TopAuditLogger; 246import org.apache.hadoop.hdfs.server.namenode.top.TopConf; 247import org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics; 248import org.apache.hadoop.hdfs.server.namenode.top.window.RollingWindowManager; 249import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods; 250import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; 251import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; 252import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport; 253import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse; 254import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat; 255import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; 256import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; 257import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; 258import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks; 259import org.apache.hadoop.hdfs.server.protocol.StorageReport; 260import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary; 261import org.apache.hadoop.io.EnumSetWritable; 262import org.apache.hadoop.io.IOUtils; 263import org.apache.hadoop.io.Text; 264import org.apache.hadoop.ipc.RetriableException; 265import org.apache.hadoop.ipc.RetryCache; 266import org.apache.hadoop.ipc.Server; 267import org.apache.hadoop.ipc.StandbyException; 268import org.apache.hadoop.metrics2.annotation.Metric; 269import org.apache.hadoop.metrics2.annotation.Metrics; 270import 
org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; 271import org.apache.hadoop.metrics2.util.MBeans; 272import org.apache.hadoop.net.NetworkTopology; 273import org.apache.hadoop.net.Node; 274import org.apache.hadoop.net.NodeBase; 275import org.apache.hadoop.security.AccessControlException; 276import org.apache.hadoop.security.UserGroupInformation; 277import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod; 278import org.apache.hadoop.security.token.SecretManager.InvalidToken; 279import org.apache.hadoop.security.token.Token; 280import org.apache.hadoop.security.token.TokenIdentifier; 281import org.apache.hadoop.security.token.delegation.DelegationKey; 282import org.apache.hadoop.util.ChunkedArrayList; 283import org.apache.hadoop.util.Daemon; 284import org.apache.hadoop.util.DataChecksum; 285import org.apache.hadoop.util.ReflectionUtils; 286import org.apache.hadoop.util.StringUtils; 287import org.apache.hadoop.util.VersionInfo; 288import org.apache.log4j.Appender; 289import org.apache.log4j.AsyncAppender; 290import org.apache.log4j.Logger; 291import org.codehaus.jackson.map.ObjectMapper; 292import org.mortbay.util.ajax.JSON; 293 294import com.google.common.annotations.VisibleForTesting; 295import com.google.common.base.Charsets; 296import com.google.common.base.Preconditions; 297import com.google.common.collect.ImmutableMap; 298import com.google.common.collect.Lists; 299 300/*************************************************** 301 * FSNamesystem does the actual bookkeeping work for the 302 * DataNode. 303 * 304 * It tracks several important tables. 
 *
 * 1) valid fsname --> blocklist  (kept on disk, logged)
 * 2) Set of all valid blocks (inverted #1)
 * 3) block --> machinelist (kept in memory, rebuilt dynamically from reports)
 * 4) machine --> blocklist (inverted #2)
 * 5) LRU cache of updated-heartbeat machines
 ***************************************************/
@InterfaceAudience.Private
@Metrics(context="dfs")
public class FSNamesystem implements Namesystem, FSNamesystemMBean,
  NameNodeMXBean {
  public static final Log LOG = LogFactory.getLog(FSNamesystem.class);

  // Per-thread scratch buffer reused when formatting audit log lines, so the
  // hot audit path does not allocate a fresh StringBuilder per event.
  private static final ThreadLocal<StringBuilder> auditBuffer =
    new ThreadLocal<StringBuilder>() {
      @Override
      protected StringBuilder initialValue() {
        return new StringBuilder();
      }
    };

  // Allocator for block IDs / generation stamps; constructed alongside the
  // BlockManager and reset by clear().
  private final BlockIdManager blockIdManager;

  /**
   * Returns whether audit events should be emitted at all. Auditing is only
   * skipped when the sole configured logger is the default one AND the
   * underlying "FSNamesystem.audit" log has INFO disabled; any custom audit
   * logger forces this to true.
   */
  @VisibleForTesting
  public boolean isAuditEnabled() {
    return !isDefaultAuditLogger || auditLog.isInfoEnabled();
  }

  /** Convenience overload for events that carry no destination path or stat. */
  private void logAuditEvent(boolean succeeded, String cmd, String src)
      throws IOException {
    logAuditEvent(succeeded, cmd, src, null, null);
  }

  /**
   * Audits an event on behalf of the current remote caller. The event is
   * emitted only when auditing is enabled and the call is an external
   * invocation (i.e. an RPC/HTTP client, not an internal namenode call);
   * the caller's UGI and remote address are captured here.
   */
  private void logAuditEvent(boolean succeeded, String cmd, String src,
      String dst, HdfsFileStatus stat) throws IOException {
    if (isAuditEnabled() && isExternalInvocation()) {
      logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(),
                    cmd, src, dst, stat);
    }
  }

  /**
   * Dispatches one audit event to every configured {@link AuditLogger}.
   * When a file stat is supplied it is converted to a {@link FileStatus}
   * first, preferring the destination path over the source (e.g. for
   * renames). Loggers implementing {@link HdfsAuditLogger} receive the
   * richer signature that includes the UGI and the delegation token
   * secret manager.
   */
  private void logAuditEvent(boolean succeeded,
      UserGroupInformation ugi, InetAddress addr, String cmd, String src,
      String dst, HdfsFileStatus stat) {
    FileStatus status = null;
    if (stat != null) {
      Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null;
      // Destination path wins when present; otherwise audit the source path.
      Path path = dst != null ? new Path(dst) : new Path(src);
      status = new FileStatus(stat.getLen(), stat.isDir(),
          stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(),
          stat.getAccessTime(), stat.getPermission(), stat.getOwner(),
          stat.getGroup(), symlink, path);
    }
    for (AuditLogger logger : auditLoggers) {
      if (logger instanceof HdfsAuditLogger) {
        HdfsAuditLogger hdfsLogger = (HdfsAuditLogger) logger;
        hdfsLogger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst,
            status, ugi, dtSecretManager);
      } else {
        logger.logAuditEvent(succeeded, ugi.toString(), addr,
            cmd, src, dst, status);
      }
    }
  }

  /**
   * Logger for audit events, noting successful FSNamesystem operations. Emits
   * to FSNamesystem.audit at INFO. Each event causes a set of tab-separated
   * <code>key=value</code> pairs to be written for the following properties:
   * <code>
   * ugi=&lt;ugi in RPC&gt;
   * ip=&lt;remote IP&gt;
   * cmd=&lt;command&gt;
   * src=&lt;src path&gt;
   * dst=&lt;dst path (optional)&gt;
   * perm=&lt;permissions (optional)&gt;
   * </code>
   */
  public static final Log auditLog = LogFactory.getLog(
      FSNamesystem.class.getName() + ".audit");

  // Cap on corrupt file-block entries returned by a single listing call.
  static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100;
  // Batch size for incremental block deletion. NOTE(review): deliberately
  // non-final — presumably so tests can tune it; confirm before finalizing.
  static int BLOCK_DELETION_INCREMENT = 1000;
  private final boolean isPermissionEnabled;    // dfs.permissions.enabled
  private final UserGroupInformation fsOwner;   // user the NN runs as
  private final String supergroup;              // dfs.permissions.superusergroup
  private final boolean standbyShouldCheckpoint;

  // Scan interval is not configurable.
  // Fixed one-hour scan interval for the delegation token remover thread.
  private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
    TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
  final DelegationTokenSecretManager dtSecretManager;
  // Forces delegation token use even in non-secure setups (test-only knob).
  private final boolean alwaysUseDelegationTokensForTests;

  // Startup-progress step reported while safe mode waits on block reports.
  private static final Step STEP_AWAITING_REPORTED_BLOCKS =
    new Step(StepType.AWAITING_REPORTED_BLOCKS);

  // Tracks whether the default audit logger is the only configured audit
  // logger; this allows isAuditEnabled() to return false in case the
  // underlying logger is disabled, and avoid some unnecessary work.
  private final boolean isDefaultAuditLogger;
  private final List<AuditLogger> auditLoggers;

  /** The namespace tree. */
  FSDirectory dir;
  private final BlockManager blockManager;
  private final SnapshotManager snapshotManager;
  private final CacheManager cacheManager;
  private final DatanodeStatistics datanodeStatistics;

  private String nameserviceId;

  // Non-null only while a rolling upgrade is in progress.
  private volatile RollingUpgradeInfo rollingUpgradeInfo = null;
  /**
   * A flag that indicates whether the checkpointer should checkpoint a rollback
   * fsimage. The edit log tailer sets this flag. The checkpoint will create a
   * rollback fsimage if the flag is true, and then change the flag to false.
   */
  private volatile boolean needRollbackFsImage;

  // Block pool ID used by this namenode
  private String blockPoolId;

  final LeaseManager leaseManager = new LeaseManager(this);

  volatile Daemon smmthread = null;  // SafeModeMonitor thread

  Daemon nnrmthread = null; // NamenodeResourceMonitor thread

  Daemon nnEditLogRoller = null; // NameNodeEditLogRoller thread

  // A daemon to periodically clean up corrupt lazyPersist files
  // from the name space.
  Daemon lazyPersistFileScrubber = null;
  /**
   * When an active namenode will roll its own edit log, in # edits
   */
  private final long editLogRollerThreshold;
  /**
   * Check interval of an active namenode's edit log roller thread
   */
  private final int editLogRollerInterval;

  /**
   * How frequently we scan and unlink corrupt lazyPersist files.
   * (In seconds)
   */
  private final int lazyPersistFileScrubIntervalSec;

  private volatile boolean hasResourcesAvailable = false;
  private volatile boolean fsRunning = true;

  /** The start time of the namesystem. */
  private final long startTime = now();

  /** The interval of namenode checking for the disk space availability */
  private final long resourceRecheckInterval;

  // The actual resource checker instance.
  NameNodeResourceChecker nnResourceChecker;

  // Defaults (block size, replication, checksum type, ...) advertised to
  // clients; built from configuration in the constructor.
  private final FsServerDefaults serverDefaults;
  private final boolean supportAppends;
  private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;

  private volatile SafeModeInfo safeMode;  // safe mode information

  private final long maxFsObjects;         // maximum number of fs objects

  private final long minBlockSize;         // minimum block size
  private final long maxBlocksPerFile;     // maximum # of blocks per file

  // precision of access times.
  private final long accessTimePrecision;

  /** Lock to protect FSNamesystem. */
  private final FSNamesystemLock fsLock;

  /**
   * Checkpoint lock to protect FSNamesystem modification on standby NNs.
   * Unlike fsLock, it does not affect block updates. On active NNs, this lock
   * does not provide proper protection, because there are operations that
   * modify both block and name system state. Even on standby, fsLock is
   * used when block state changes need to be blocked.
   */
  private final ReentrantLock cpLock;

  /**
   * Used when this NN is in standby state to read from the shared edit log.
   */
  private EditLogTailer editLogTailer = null;

  /**
   * Used when this NN is in standby state to perform checkpoints.
   */
  private StandbyCheckpointer standbyCheckpointer;

  /**
   * Reference to the NN's HAContext object. This is only set once
   * {@link #startCommonServices(Configuration, HAContext)} is called.
   */
  private HAContext haContext;

  private final boolean haEnabled;

  /** flag indicating whether replication queues have been initialized */
  boolean initializedReplQueues = false;

  /**
   * Whether the namenode is in the middle of starting the active service
   */
  private volatile boolean startingActiveService = false;

  private final RetryCache retryCache;

  // Key provider for encryption-at-rest; null when none is configured.
  private KeyProviderCryptoExtension provider = null;

  // True once the fsimage has been loaded; waiters block on 'cond' below.
  private volatile boolean imageLoaded = false;
  // Condition bound to the write lock, signalled by setImageLoaded().
  private final Condition cond;

  private final FSImage fsImage;

  private final TopConf topConf;
  private TopMetrics topMetrics;

  private INodeAttributeProvider inodeAttributeProvider;

  /**
   * Notify that loading of this FSDirectory is complete, and
   * it is imageLoaded for use
   */
  void imageLoadComplete() {
    Preconditions.checkState(!imageLoaded, "FSDirectory already loaded");
    setImageLoaded();
  }

  // Marks the image as loaded (idempotent) and wakes every thread blocked in
  // waitForLoadingFSImage(). Takes the write lock because 'cond' is bound
  // to it.
  void setImageLoaded() {
    if(imageLoaded) return;
    writeLock();
    try {
      setImageLoaded(true);
      dir.markNameCacheInitialized();
      cond.signalAll();
    } finally {
      writeUnlock();
    }
  }

  //This is for testing purposes only
  @VisibleForTesting
  boolean isImageLoaded() {
    return imageLoaded;
  }

  // exposed for unit tests
  protected void setImageLoaded(boolean flag) {
    imageLoaded = flag;
  }

  /**
   * Block until the object is imageLoaded to be used.
   */
  void waitForLoadingFSImage() {
    if (!imageLoaded) {
      writeLock();
      try {
        while (!imageLoaded) {
          try {
            // Re-check imageLoaded every 5s; the timed wait guards against a
            // missed signal. NOTE(review): InterruptedException is swallowed
            // without restoring the interrupt status — looks deliberate here
            // (the caller must not proceed before the image loads), but
            // confirm against the project's interruption policy.
            cond.await(5000, TimeUnit.MILLISECONDS);
          } catch (InterruptedException ignored) {
          }
        }
      } finally {
        writeUnlock();
      }
    }
  }

  /**
   * Clear all loaded data
   */
  void clear() {
    dir.reset();
    dtSecretManager.reset();
    blockIdManager.clear();
    leaseManager.removeAllLeases();
    snapshotManager.clearSnapshottableDirs();
    cacheManager.clear();
    setImageLoaded(false);
    blockManager.clear();
  }

  @VisibleForTesting
  LeaseManager getLeaseManager() {
    return leaseManager;
  }

  boolean isHaEnabled() {
    return haEnabled;
  }

  /**
   * Check the supplied configuration for correctness.
   * @param conf Supplies the configuration to validate.
   * @throws IOException if the configuration could not be queried.
   * @throws IllegalArgumentException if the configuration is invalid.
   */
  private static void checkConfiguration(Configuration conf)
      throws IOException {

    final Collection<URI> namespaceDirs =
        FSNamesystem.getNamespaceDirs(conf);
    final Collection<URI> editsDirs =
        FSNamesystem.getNamespaceEditsDirs(conf);
    final Collection<URI> requiredEditsDirs =
        FSNamesystem.getRequiredNamespaceEditsDirs(conf);
    final Collection<URI> sharedEditsDirs =
        FSNamesystem.getSharedEditsDirs(conf);

    for (URI u : requiredEditsDirs) {
      // The default edits dir is implicitly acceptable as a required dir.
      if (u.toString().compareTo(
              DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT) == 0) {
        continue;
      }

      // Each required directory must also be in editsDirs or in
      // sharedEditsDirs.
      if (!editsDirs.contains(u) &&
          !sharedEditsDirs.contains(u)) {
        throw new IllegalArgumentException(
            "Required edits directory " + u.toString() + " not present in " +
            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + ". " +
            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + "=" +
            editsDirs.toString() + "; " +
            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY + "=" +
            requiredEditsDirs.toString() + ". " +
            DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY + "=" +
            sharedEditsDirs.toString() + ".");
      }
    }

    // Single storage directories are legal but risky — warn loudly.
    if (namespaceDirs.size() == 1) {
      LOG.warn("Only one image storage directory ("
          + DFS_NAMENODE_NAME_DIR_KEY + ") configured. Beware of data loss"
          + " due to lack of redundant storage directories!");
    }
    if (editsDirs.size() == 1) {
      LOG.warn("Only one namespace edits storage directory ("
          + DFS_NAMENODE_EDITS_DIR_KEY + ") configured. Beware of data loss"
          + " due to lack of redundant storage directories!");
    }
  }

  /**
   * Instantiates an FSNamesystem loaded from the image and edits
   * directories specified in the passed Configuration.
   *
   * @param conf the Configuration which specifies the storage directories
   *             from which to load
   * @return an FSNamesystem which contains the loaded namespace
   * @throws IOException if loading fails
   */
  static FSNamesystem loadFromDisk(Configuration conf) throws IOException {

    checkConfiguration(conf);
    FSImage fsImage = new FSImage(conf,
        FSNamesystem.getNamespaceDirs(conf),
        FSNamesystem.getNamespaceEditsDirs(conf));
    FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
    StartupOption startOpt = NameNode.getStartupOption(conf);
    // Recovery mode implies safe mode so no mutations land mid-recovery.
    if (startOpt == StartupOption.RECOVER) {
      namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
    }

    long loadStart = monotonicNow();
    try {
      namesystem.loadFSImage(startOpt);
    } catch (IOException ioe) {
      // Close the image on failure so its storage directories are unlocked.
      LOG.warn("Encountered exception loading fsimage", ioe);
      fsImage.close();
      throw ioe;
    }
    long timeTakenToLoadFSImage = monotonicNow() - loadStart;
    LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
    NameNodeMetrics
nnMetrics = NameNode.getNameNodeMetrics(); 690 if (nnMetrics != null) { 691 nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage); 692 } 693 return namesystem; 694 } 695 696 FSNamesystem(Configuration conf, FSImage fsImage) throws IOException { 697 this(conf, fsImage, false); 698 } 699 700 /** 701 * Create an FSNamesystem associated with the specified image. 702 * 703 * Note that this does not load any data off of disk -- if you would 704 * like that behavior, use {@link #loadFromDisk(Configuration)} 705 * 706 * @param conf configuration 707 * @param fsImage The FSImage to associate with 708 * @param ignoreRetryCache Whether or not should ignore the retry cache setup 709 * step. For Secondary NN this should be set to true. 710 * @throws IOException on bad configuration 711 */ 712 FSNamesystem(Configuration conf, FSImage fsImage, boolean ignoreRetryCache) 713 throws IOException { 714 provider = DFSUtil.createKeyProviderCryptoExtension(conf); 715 if (provider == null) { 716 LOG.info("No KeyProvider found."); 717 } else { 718 LOG.info("Found KeyProvider: " + provider.toString()); 719 } 720 if (conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY, 721 DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT)) { 722 LOG.info("Enabling async auditlog"); 723 enableAsyncAuditLog(); 724 } 725 boolean fair = conf.getBoolean("dfs.namenode.fslock.fair", true); 726 LOG.info("fsLock is fair:" + fair); 727 fsLock = new FSNamesystemLock(fair); 728 cond = fsLock.writeLock().newCondition(); 729 cpLock = new ReentrantLock(); 730 731 this.fsImage = fsImage; 732 try { 733 resourceRecheckInterval = conf.getLong( 734 DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY, 735 DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT); 736 737 this.blockManager = new BlockManager(this, conf); 738 this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics(); 739 this.blockIdManager = new BlockIdManager(blockManager); 740 741 this.fsOwner = UserGroupInformation.getCurrentUser(); 742 this.supergroup = 
          conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY,
              DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
      this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY,
          DFS_PERMISSIONS_ENABLED_DEFAULT);
      LOG.info("fsOwner = " + fsOwner);
      LOG.info("supergroup = " + supergroup);
      LOG.info("isPermissionEnabled = " + isPermissionEnabled);

      // block allocation has to be persisted in HA using a shared edits directory
      // so that the standby has up-to-date namespace information
      nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
      this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId);

      // Sanity check the HA-related config.
      if (nameserviceId != null) {
        LOG.info("Determined nameservice ID: " + nameserviceId);
      }
      LOG.info("HA Enabled: " + haEnabled);
      if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) {
        // A shared edits dir without HA is almost certainly a misconfiguration.
        LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf));
        throw new IOException("Invalid configuration: a shared edits dir " +
            "must not be specified if HA is not enabled.");
      }

      // Get the checksum type from config
      String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT);
      DataChecksum.Type checksumType;
      try {
        checksumType = DataChecksum.Type.valueOf(checksumTypeStr);
      } catch (IllegalArgumentException iae) {
        throw new IOException("Invalid checksum type in "
            + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr);
      }

      // Defaults handed out to clients (block size, packet size, etc.).
      this.serverDefaults = new FsServerDefaults(
          conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT),
          conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT),
          conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT),
          (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT),
          conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT),
          conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT),
          conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT),
          checksumType);

      this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY,
          DFS_NAMENODE_MAX_OBJECTS_DEFAULT);

      this.minBlockSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,
          DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_DEFAULT);
      this.maxBlocksPerFile = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY,
          DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT);
      this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY,
          DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT);
      this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT);
      LOG.info("Append Enabled: " + supportAppends);

      this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);

      this.standbyShouldCheckpoint = conf.getBoolean(
          DFS_HA_STANDBY_CHECKPOINTS_KEY, DFS_HA_STANDBY_CHECKPOINTS_DEFAULT);
      // # edit autoroll threshold is a multiple of the checkpoint threshold
      this.editLogRollerThreshold = (long)
          (conf.getFloat(
              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD,
              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT) *
          conf.getLong(
              DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
              DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT));
      this.editLogRollerInterval = conf.getInt(
          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS,
          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT);

      this.lazyPersistFileScrubIntervalSec = conf.getInt(
          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC,
          DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC_DEFAULT);

      // Zero would silently disable the scrubber; negative is the documented
      // way to disable it (see startActiveServices), so reject zero outright.
      if (this.lazyPersistFileScrubIntervalSec == 0) {
        throw new IllegalArgumentException(
            DFS_NAMENODE_LAZY_PERSIST_FILE_SCRUB_INTERVAL_SEC + " must be non-zero.");
      }

      // For testing purposes, allow the DT secret manager to be started regardless
      // of whether security is enabled.
      alwaysUseDelegationTokensForTests = conf.getBoolean(
          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);

      this.dtSecretManager = createDelegationTokenSecretManager(conf);
      this.dir = new FSDirectory(this, conf);
      this.snapshotManager = new SnapshotManager(dir);
      this.cacheManager = new CacheManager(this, conf, blockManager);
      this.safeMode = new SafeModeInfo(conf);
      this.topConf = new TopConf(conf);
      // topConf must be set before initAuditLoggers(), which reads it.
      this.auditLoggers = initAuditLoggers(conf);
      this.isDefaultAuditLogger = auditLoggers.size() == 1 &&
          auditLoggers.get(0) instanceof DefaultAuditLogger;
      this.retryCache = ignoreRetryCache ? null : initRetryCache(conf);
      Class<? extends INodeAttributeProvider> klass = conf.getClass(
          DFS_NAMENODE_INODE_ATTRIBUTES_PROVIDER_KEY,
          null, INodeAttributeProvider.class);
      if (klass != null) {
        inodeAttributeProvider = ReflectionUtils.newInstance(klass, conf);
        LOG.info("Using INode attribute provider: " + klass.getName());
      }
    } catch(IOException e) {
      // Partial construction: release whatever was started before rethrowing.
      LOG.error(getClass().getSimpleName() + " initialization failed.", e);
      close();
      throw e;
    } catch (RuntimeException re) {
      LOG.error(getClass().getSimpleName() + " initialization failed.", re);
      close();
      throw re;
    }
  }

  @VisibleForTesting
  public List<AuditLogger> getAuditLoggers() {
    return auditLoggers;
  }

  @VisibleForTesting
  public RetryCache getRetryCache() {
    return retryCache;
  }

  /** Acquire the retry cache lock; no-op when the cache is disabled. */
  void lockRetryCache() {
    if (retryCache != null) {
      retryCache.lock();
    }
  }

  /** Release the retry cache lock; no-op when the cache is disabled. */
  void unlockRetryCache() {
    if (retryCache != null) {
      retryCache.unlock();
    }
  }

  /** Whether or not retry cache is enabled */
  boolean hasRetryCache() {
    return retryCache != null;
  }

  /** Record a completed call (with its result payload) in the retry cache. */
  void addCacheEntryWithPayload(byte[] clientId, int callId, Object payload) {
    if (retryCache != null) {
      retryCache.addCacheEntryWithPayload(clientId, callId, payload);
    }
  }

  /** Record a completed call in the retry cache; no-op if cache disabled. */
  void addCacheEntry(byte[] clientId, int callId) {
    if (retryCache != null) {
      retryCache.addCacheEntry(clientId, callId);
    }
  }

  @VisibleForTesting
  public KeyProviderCryptoExtension getProvider() {
    return provider;
  }

  /**
   * Build the retry cache from configuration, or return null when disabled.
   */
  @VisibleForTesting
  static RetryCache initRetryCache(Configuration conf) {
    boolean enable = conf.getBoolean(DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY,
        DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT);
    LOG.info("Retry cache on namenode is " + (enable ? "enabled" : "disabled"));
    if (enable) {
      float heapPercent = conf.getFloat(
          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY,
          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT);
      long entryExpiryMillis = conf.getLong(
          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY,
          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT);
      LOG.info("Retry cache will use " + heapPercent
          + " of total heap and retry cache entry expiry time is "
          + entryExpiryMillis + " millis");
      // millis -> nanos
      long entryExpiryNanos = entryExpiryMillis * 1000 * 1000;
      return new RetryCache("NameNodeRetryCache", heapPercent,
          entryExpiryNanos);
    }
    return null;
  }

  /**
   * Instantiate the configured audit loggers (plus the default and the
   * top-users logger as needed) and return them as an unmodifiable list.
   */
  private List<AuditLogger> initAuditLoggers(Configuration conf) {
    // Initialize the custom access loggers if configured.
    Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY);
    List<AuditLogger> auditLoggers = Lists.newArrayList();
    if (alClasses != null && !alClasses.isEmpty()) {
      for (String className : alClasses) {
        try {
          AuditLogger logger;
          if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) {
            logger = new DefaultAuditLogger();
          } else {
            // Custom loggers are loaded reflectively by class name.
            logger = (AuditLogger) Class.forName(className).newInstance();
          }
          logger.initialize(conf);
          auditLoggers.add(logger);
        } catch (RuntimeException re) {
          throw re;
        } catch (Exception e) {
          // A misconfigured audit logger is fatal for NN startup.
          throw new RuntimeException(e);
        }
      }
    }

    // Make sure there is at least one logger installed.
    if (auditLoggers.isEmpty()) {
      auditLoggers.add(new DefaultAuditLogger());
    }

    // Add audit logger to calculate top users
    if (topConf.isEnabled) {
      topMetrics = new TopMetrics(conf, topConf.nntopReportingPeriodsMs);
      auditLoggers.add(new TopAuditLogger(topMetrics));
    }

    return Collections.unmodifiableList(auditLoggers);
  }

  /**
   * Load the FSImage (formatting first if requested), optionally save a new
   * checkpoint, and open the edit log for write when this NN will be the
   * writer. Closes the image on failure.
   */
  private void loadFSImage(StartupOption startOpt) throws IOException {
    final FSImage fsImage = getFSImage();

    // format before starting up if requested
    if (startOpt == StartupOption.FORMAT) {

      fsImage.format(this, fsImage.getStorage().determineClusterId());// reuse current id

      startOpt = StartupOption.REGULAR;
    }
    boolean success = false;
    writeLock();
    try {
      // We shouldn't be calling saveNamespace if we've come up in standby state.
      MetaRecoveryContext recovery = startOpt.createRecoveryContext();
      final boolean staleImage
          = fsImage.recoverTransitionRead(startOpt, this, recovery);
      if (RollingUpgradeStartupOption.ROLLBACK.matches(startOpt) ||
          RollingUpgradeStartupOption.DOWNGRADE.matches(startOpt)) {
        rollingUpgradeInfo = null;
      }
      final boolean needToSave = staleImage && !haEnabled && !isRollingUpgrade();
      LOG.info("Need to save fs image? " + needToSave
          + " (staleImage=" + staleImage + ", haEnabled=" + haEnabled
          + ", isRollingUpgrade=" + isRollingUpgrade() + ")");
      if (needToSave) {
        fsImage.saveNamespace(this);
      } else {
        updateStorageVersionForRollingUpgrade(fsImage.getLayoutVersion(),
            startOpt);
        // No need to save, so mark the phase done.
        StartupProgress prog = NameNode.getStartupProgress();
        prog.beginPhase(Phase.SAVING_CHECKPOINT);
        prog.endPhase(Phase.SAVING_CHECKPOINT);
      }
      // This will start a new log segment and write to the seen_txid file, so
      // we shouldn't do it when coming up in standby state
      if (!haEnabled || (haEnabled && startOpt == StartupOption.UPGRADE)
          || (haEnabled && startOpt == StartupOption.UPGRADEONLY)) {
        fsImage.openEditLogForWrite();
      }
      success = true;
    } finally {
      if (!success) {
        // Release storage locks if any step above failed.
        fsImage.close();
      }
      writeUnlock();
    }
    // Marks imageLoaded and signals waiters (see waitForLoadingFSImage).
    imageLoadComplete();
  }

  /**
   * Persist the new storage version when a rolling upgrade is starting or
   * rolling back.
   */
  private void updateStorageVersionForRollingUpgrade(final long layoutVersion,
      StartupOption startOpt) throws IOException {
    boolean rollingStarted = RollingUpgradeStartupOption.STARTED
        .matches(startOpt) && layoutVersion > HdfsConstants
        .NAMENODE_LAYOUT_VERSION;
    boolean rollingRollback = RollingUpgradeStartupOption.ROLLBACK
        .matches(startOpt);
    if (rollingRollback || rollingStarted) {
      fsImage.updateStorageVersion();
    }
  }

  /** Start the delegation token secret manager threads, if configured. */
  private void startSecretManager() {
    if (dtSecretManager != null) {
      try {
        dtSecretManager.startThreads();
      } catch (IOException e) {
        // Inability to start secret manager
        // can't be recovered from.
        throw new RuntimeException(e);
      }
    }
  }

  /**
   * Start the secret manager only when tokens are in use, we are out of
   * safemode, and this NN owns the edit log.
   */
  private void startSecretManagerIfNecessary() {
    boolean shouldRun = shouldUseDelegationTokens() &&
        !isInSafeMode() && getEditLog().isOpenForWrite();
    boolean running = dtSecretManager.isRunning();
    if (shouldRun && !running) {
      startSecretManager();
    }
  }

  private void stopSecretManager() {
    if (dtSecretManager != null) {
      dtSecretManager.stopThreads();
    }
  }

  /**
   * Start services common to both active and standby states
   */
  void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
    this.registerMBean(); // register the MBean for the FSNamesystemState
    writeLock();
    // NOTE(review): this assignment sits between writeLock() and try; if any
    // statement here could throw, the lock would leak. Confirm it stays trivial.
    this.haContext = haContext;
    try {
      nnResourceChecker = new NameNodeResourceChecker(conf);
      checkAvailableResources();
      assert safeMode != null && !isPopulatingReplQueues();
      StartupProgress prog = NameNode.getStartupProgress();
      prog.beginPhase(Phase.SAFEMODE);
      prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
          getCompleteBlocksTotal());
      setBlockTotal();
      blockManager.activate(conf);
    } finally {
      writeUnlock();
    }

    registerMXBean();
    DefaultMetricsSystem.instance().register(this);
    if (inodeAttributeProvider != null) {
      inodeAttributeProvider.start();
      dir.setINodeAttributeProvider(inodeAttributeProvider);
    }
    snapshotManager.registerMXBean();
  }

  /**
   * Stop services common to both active and standby states
   */
  void stopCommonServices() {
    writeLock();
    // NOTE(review): provider teardown runs outside the try/finally below; if
    // stop() throws, writeUnlock() is never reached. Verify stop() is safe.
    if (inodeAttributeProvider != null) {
      dir.setINodeAttributeProvider(null);
      inodeAttributeProvider.stop();
    }
    try {
      if (blockManager != null) blockManager.close();
    } finally {
      writeUnlock();
    }
    RetryCache.clear(retryCache);
  }

  /**
   * Start services required in active state
   * @throws IOException
   */
  void startActiveServices() throws IOException {
    startingActiveService = true;
    LOG.info("Starting services required for active state");
    writeLock();
    try {
      FSEditLog editLog = getFSImage().getEditLog();

      if (!editLog.isOpenForWrite()) {
        // During startup, we're already open for write during initialization.
        editLog.initJournalsForWrite();
        // May need to recover
        editLog.recoverUnclosedStreams();

        LOG.info("Catching up to latest edits from old active before " +
            "taking over writer role in edits logs");
        editLogTailer.catchupDuringFailover();

        blockManager.setPostponeBlocksFromFuture(false);
        blockManager.getDatanodeManager().markAllDatanodesStale();
        blockManager.clearQueues();
        blockManager.processAllPendingDNMessages();

        // Only need to re-process the queue, If not in SafeMode.
        if (!isInSafeMode()) {
          LOG.info("Reprocessing replication and invalidation queues");
          initializeReplQueues();
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug("NameNode metadata after re-processing " +
              "replication and invalidation queues during failover:\n" +
              metaSaveAsString());
        }

        long nextTxId = getFSImage().getLastAppliedTxId() + 1;
        LOG.info("Will take over writing edit logs at txnid " +
            nextTxId);
        editLog.setNextTxId(nextTxId);

        getFSImage().editLog.openForWrite();
      }

      // Enable quota checks.
      dir.enableQuotaChecks();
      if (haEnabled) {
        // Renew all of the leases before becoming active.
        // This is because, while we were in standby mode,
        // the leases weren't getting renewed on this NN.
        // Give them all a fresh start here.
        leaseManager.renewAllLeases();
      }
      leaseManager.startMonitor();
      startSecretManagerIfNecessary();

      //ResourceMonitor required only at ActiveNN. See HDFS-2914
      this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
      nnrmthread.start();

      nnEditLogRoller = new Daemon(new NameNodeEditLogRoller(
          editLogRollerThreshold, editLogRollerInterval));
      nnEditLogRoller.start();

      // Negative interval disables the scrubber (zero is rejected in the ctor).
      if (lazyPersistFileScrubIntervalSec > 0) {
        lazyPersistFileScrubber = new Daemon(new LazyPersistFileScrubber(
            lazyPersistFileScrubIntervalSec));
        lazyPersistFileScrubber.start();
      }

      cacheManager.startMonitorThread();
      blockManager.getDatanodeManager().setShouldSendCachingCommands(true);
    } finally {
      startingActiveService = false;
      checkSafeMode();
      writeUnlock();
    }
  }

  /**
   * Initialize replication queues.
   */
  private void initializeReplQueues() {
    LOG.info("initializing replication queues");
    blockManager.processMisReplicatedBlocks();
    initializedReplQueues = true;
  }

  /** @return true when the HA context reports this NN as ACTIVE. */
  private boolean inActiveState() {
    return haContext != null &&
        haContext.getState().getServiceState() == HAServiceState.ACTIVE;
  }

  /**
   * @return Whether the namenode is transitioning to active state and is in the
   *         middle of the {@link #startActiveServices()}
   */
  public boolean inTransitionToActive() {
    return haEnabled && inActiveState() && startingActiveService;
  }

  private boolean shouldUseDelegationTokens() {
    return UserGroupInformation.isSecurityEnabled() ||
        alwaysUseDelegationTokensForTests;
  }

  /**
   * Stop services required in active state
   */
  void stopActiveServices() {
    LOG.info("Stopping services started for active state");
    writeLock();
    try {
      stopSecretManager();
      leaseManager.stopMonitor();
      if (nnrmthread != null) {
        ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
        nnrmthread.interrupt();
      }
      if (nnEditLogRoller != null) {
        ((NameNodeEditLogRoller)nnEditLogRoller.getRunnable()).stop();
        nnEditLogRoller.interrupt();
      }
      if (lazyPersistFileScrubber != null) {
        ((LazyPersistFileScrubber) lazyPersistFileScrubber.getRunnable()).stop();
        lazyPersistFileScrubber.interrupt();
      }
      if (dir != null && getFSImage() != null) {
        if (getFSImage().editLog != null) {
          getFSImage().editLog.close();
        }
        // Update the fsimage with the last txid that we wrote
        // so that the tailer starts from the right spot.
        getFSImage().updateLastAppliedTxIdFromWritten();
      }
      if (cacheManager != null) {
        cacheManager.stopMonitorThread();
        cacheManager.clearDirectiveStats();
      }
      blockManager.getDatanodeManager().clearPendingCachingCommands();
      blockManager.getDatanodeManager().setShouldSendCachingCommands(false);
      // Don't want to keep replication queues when not in Active.
      blockManager.clearQueues();
      initializedReplQueues = false;
    } finally {
      writeUnlock();
    }
  }

  /**
   * Start services required in standby state
   *
   * @throws IOException
   */
  void startStandbyServices(final Configuration conf) throws IOException {
    LOG.info("Starting services required for standby state");
    if (!getFSImage().editLog.isOpenForRead()) {
      // During startup, we're already open for read.
      getFSImage().editLog.initSharedJournalsForRead();
    }

    blockManager.setPostponeBlocksFromFuture(true);

    // Disable quota checks while in standby.
    dir.disableQuotaChecks();
    editLogTailer = new EditLogTailer(this, conf);
    editLogTailer.start();
    if (standbyShouldCheckpoint) {
      standbyCheckpointer = new StandbyCheckpointer(conf, this);
      standbyCheckpointer.start();
    }
  }

  /**
   * Called when the NN is in Standby state and the editlog tailer tails the
   * OP_ROLLING_UPGRADE_START.
   */
  void triggerRollbackCheckpoint() {
    setNeedRollbackFsImage(true);
    if (standbyCheckpointer != null) {
      standbyCheckpointer.triggerRollbackCheckpoint();
    }
  }

  /**
   * Called while the NN is in Standby state, but just about to be
   * asked to enter Active state. This cancels any checkpoints
   * currently being taken.
   */
  void prepareToStopStandbyServices() throws ServiceFailedException {
    if (standbyCheckpointer != null) {
      standbyCheckpointer.cancelAndPreventCheckpoints(
          "About to leave standby state");
    }
  }

  /** Stop services required in standby state */
  void stopStandbyServices() throws IOException {
    LOG.info("Stopping services started for standby state");
    if (standbyCheckpointer != null) {
      standbyCheckpointer.stop();
    }
    if (editLogTailer != null) {
      editLogTailer.stop();
    }
    if (dir != null && getFSImage() != null && getFSImage().editLog != null) {
      getFSImage().editLog.close();
    }
  }

  @Override
  public void checkOperation(OperationCategory op) throws StandbyException {
    if (haContext != null) {
      // null in some unit tests
      haContext.checkOperation(op);
    }
  }

  /**
   * @throws RetriableException
   *           If 1) The NameNode is in SafeMode, 2) HA is enabled, and 3)
   *           NameNode is in active state
   * @throws SafeModeException
   *           Otherwise if NameNode is in SafeMode.
   */
  void checkNameNodeSafeMode(String errorMsg)
      throws RetriableException, SafeModeException {
    if (isInSafeMode()) {
      SafeModeException se = new SafeModeException(errorMsg, safeMode);
      // In HA, startup safemode on the active is transient, so let the
      // client retry instead of failing outright.
      if (haEnabled && haContext != null
          && haContext.getState().getServiceState() == HAServiceState.ACTIVE
          && shouldRetrySafeMode(this.safeMode)) {
        throw new RetriableException(se);
      } else {
        throw se;
      }
    }
  }

  boolean isPermissionEnabled() {
    return isPermissionEnabled;
  }

  /**
   * We already know that the safemode is on. We will throw a RetriableException
   * if the safemode is not manual or caused by low resource.
   */
  private boolean shouldRetrySafeMode(SafeModeInfo safeMode) {
    if (safeMode == null) {
      return false;
    } else {
      return !safeMode.isManual() && !safeMode.areResourcesLow();
    }
  }

  /** @return the configured fsimage storage directories. */
  public static Collection<URI> getNamespaceDirs(Configuration conf) {
    return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
  }

  /**
   * Get all edits dirs which are required. If any shared edits dirs are
   * configured, these are also included in the set of required dirs.
   *
   * @param conf the HDFS configuration.
   * @return all required dirs.
   */
  public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) {
    Set<URI> ret = new HashSet<URI>();
    ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY));
    ret.addAll(getSharedEditsDirs(conf));
    return ret;
  }

  /**
   * Resolve the storage directories named by {@code propertyName} into URIs,
   * applying IMPORT-mode filtering and the default-directory fallback.
   */
  private static Collection<URI> getStorageDirs(Configuration conf,
                                                String propertyName) {
    Collection<String> dirNames = conf.getTrimmedStringCollection(propertyName);
    StartupOption startOpt = NameNode.getStartupOption(conf);
    if(startOpt == StartupOption.IMPORT) {
      // In case of IMPORT this will get rid of default directories
      // but will retain directories specified in hdfs-site.xml
      // When importing image from a checkpoint, the name-node can
      // start with empty set of storage directories.
      Configuration cE = new HdfsConfiguration(false);
      cE.addResource("core-default.xml");
      cE.addResource("core-site.xml");
      cE.addResource("hdfs-default.xml");
      Collection<String> dirNames2 = cE.getTrimmedStringCollection(propertyName);
      // Remove the values that come only from the default resources.
      dirNames.removeAll(dirNames2);
      if(dirNames.isEmpty())
        LOG.warn("!!! WARNING !!!" +
          "\n\tThe NameNode currently runs without persistent storage." +
          "\n\tAny changes to the file system meta-data may be lost." +
          "\n\tRecommended actions:" +
          "\n\t\t- shutdown and restart NameNode with configured \""
          + propertyName + "\" in hdfs-site.xml;" +
          "\n\t\t- use Backup Node as a persistent and up-to-date storage " +
          "of the file system meta-data.");
    } else if (dirNames.isEmpty()) {
      // Nothing configured: fall back to the default edits directory.
      dirNames = Collections.singletonList(
          DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT);
    }
    return Util.stringCollectionAsURIs(dirNames);
  }

  /**
   * Return an ordered list of edits directories to write to.
   * The list is ordered such that all shared edits directories
   * are ordered before non-shared directories, and any duplicates
   * are removed. The order they are specified in the configuration
   * is retained.
   * @return Collection of shared edits directories.
   * @throws IOException if multiple shared edits directories are configured
   */
  public static List<URI> getNamespaceEditsDirs(Configuration conf)
      throws IOException {
    return getNamespaceEditsDirs(conf, true);
  }

  public static List<URI> getNamespaceEditsDirs(Configuration conf,
      boolean includeShared)
      throws IOException {
    // Use a LinkedHashSet so that order is maintained while we de-dup
    // the entries.
    LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>();

    if (includeShared) {
      List<URI> sharedDirs = getSharedEditsDirs(conf);

      // Fail until multiple shared edits directories are supported (HDFS-2782)
      if (sharedDirs.size() > 1) {
        throw new IOException(
            "Multiple shared edits directories are not yet supported");
      }

      // First add the shared edits dirs. It's critical that the shared dirs
      // are added first, since JournalSet syncs them in the order they are listed,
      // and we need to make sure all edits are in place in the shared storage
      // before they are replicated locally. See HDFS-2874.
      for (URI dir : sharedDirs) {
        if (!editsDirs.add(dir)) {
          LOG.warn("Edits URI " + dir + " listed multiple times in " +
              DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates.");
        }
      }
    }
    // Now add the non-shared dirs.
    for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) {
      if (!editsDirs.add(dir)) {
        LOG.warn("Edits URI " + dir + " listed multiple times in " +
            DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " +
            DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates.");
      }
    }

    if (editsDirs.isEmpty()) {
      // If this is the case, no edit dirs have been explicitly configured.
      // Image dirs are to be used for edits too.
      return Lists.newArrayList(getNamespaceDirs(conf));
    } else {
      return Lists.newArrayList(editsDirs);
    }
  }

  /**
   * Returns edit directories that are shared between primary and secondary.
   * @param conf configuration
   * @return collection of edit directories from {@code conf}
   */
  public static List<URI> getSharedEditsDirs(Configuration conf) {
    // don't use getStorageDirs here, because we want an empty default
    // rather than the dir in /tmp
    Collection<String> dirNames = conf.getTrimmedStringCollection(
        DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
    return Util.stringCollectionAsURIs(dirNames);
  }

  @Override
  public void readLock() {
    this.fsLock.readLock().lock();
  }
  @Override
  public void readUnlock() {
    this.fsLock.readLock().unlock();
  }
  @Override
  public void writeLock() {
    this.fsLock.writeLock().lock();
  }
  @Override
  public void writeLockInterruptibly() throws InterruptedException {
    this.fsLock.writeLock().lockInterruptibly();
  }
  @Override
  public void writeUnlock() {
    this.fsLock.writeLock().unlock();
  }
  @Override
  public boolean hasWriteLock() {
    return this.fsLock.isWriteLockedByCurrentThread();
  }
  @Override
  public boolean hasReadLock() {
    // The write lock implies read access as well.
    return this.fsLock.getReadHoldCount() > 0 || hasWriteLock();
  }

  public int getReadHoldCount() {
    return this.fsLock.getReadHoldCount();
  }

  public int getWriteHoldCount() {
    return this.fsLock.getWriteHoldCount();
  }

  /** Lock the checkpoint lock */
  public void cpLock() {
    this.cpLock.lock();
  }

  /** Lock the checkpoint lock interrupibly */
  public void cpLockInterruptibly() throws InterruptedException {
    this.cpLock.lockInterruptibly();
  }

  /** Unlock the checkpoint lock */
  public void cpUnlock() {
    this.cpLock.unlock();
  }

1528 NamespaceInfo getNamespaceInfo() { 1529 readLock(); 1530 try { 1531 return unprotectedGetNamespaceInfo(); 1532 } finally { 1533 readUnlock(); 1534 } 1535 } 1536 1537 /** 1538 * Version of @see #getNamespaceInfo() that is not protected by a lock. 1539 */ 1540 NamespaceInfo unprotectedGetNamespaceInfo() { 1541 return new NamespaceInfo(getFSImage().getStorage().getNamespaceID(), 1542 getClusterId(), getBlockPoolId(), 1543 getFSImage().getStorage().getCTime()); 1544 } 1545 1546 /** 1547 * Close down this file system manager. 1548 * Causes heartbeat and lease daemons to stop; waits briefly for 1549 * them to finish, but a short timeout returns control back to caller. 1550 */ 1551 void close() { 1552 fsRunning = false; 1553 try { 1554 stopCommonServices(); 1555 if (smmthread != null) smmthread.interrupt(); 1556 } finally { 1557 // using finally to ensure we also wait for lease daemon 1558 try { 1559 stopActiveServices(); 1560 stopStandbyServices(); 1561 } catch (IOException ie) { 1562 } finally { 1563 IOUtils.cleanup(LOG, dir); 1564 IOUtils.cleanup(LOG, fsImage); 1565 } 1566 } 1567 } 1568 1569 @Override 1570 public boolean isRunning() { 1571 return fsRunning; 1572 } 1573 1574 @Override 1575 public boolean isInStandbyState() { 1576 if (haContext == null || haContext.getState() == null) { 1577 // We're still starting up. In this case, if HA is 1578 // on for the cluster, we always start in standby. Otherwise 1579 // start in active. 
1580 return haEnabled; 1581 } 1582 1583 return HAServiceState.STANDBY == haContext.getState().getServiceState(); 1584 } 1585 1586 /** 1587 * Dump all metadata into specified file 1588 */ 1589 void metaSave(String filename) throws IOException { 1590 checkSuperuserPrivilege(); 1591 checkOperation(OperationCategory.UNCHECKED); 1592 writeLock(); 1593 try { 1594 checkOperation(OperationCategory.UNCHECKED); 1595 File file = new File(System.getProperty("hadoop.log.dir"), filename); 1596 PrintWriter out = new PrintWriter(new BufferedWriter( 1597 new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8))); 1598 metaSave(out); 1599 out.flush(); 1600 out.close(); 1601 } finally { 1602 writeUnlock(); 1603 } 1604 } 1605 1606 private void metaSave(PrintWriter out) { 1607 assert hasWriteLock(); 1608 long totalInodes = this.dir.totalInodes(); 1609 long totalBlocks = this.getBlocksTotal(); 1610 out.println(totalInodes + " files and directories, " + totalBlocks 1611 + " blocks = " + (totalInodes + totalBlocks) + " total"); 1612 1613 blockManager.metaSave(out); 1614 } 1615 1616 private String metaSaveAsString() { 1617 StringWriter sw = new StringWriter(); 1618 PrintWriter pw = new PrintWriter(sw); 1619 metaSave(pw); 1620 pw.flush(); 1621 return sw.toString(); 1622 } 1623 1624 FsServerDefaults getServerDefaults() throws StandbyException { 1625 checkOperation(OperationCategory.READ); 1626 return serverDefaults; 1627 } 1628 1629 long getAccessTimePrecision() { 1630 return accessTimePrecision; 1631 } 1632 1633 private boolean isAccessTimeSupported() { 1634 return accessTimePrecision > 0; 1635 } 1636 1637 ///////////////////////////////////////////////////////// 1638 // 1639 // These methods are called by HadoopFS clients 1640 // 1641 ///////////////////////////////////////////////////////// 1642 /** 1643 * Set permissions for an existing file. 
1644 * @throws IOException 1645 */ 1646 void setPermission(String src, FsPermission permission) throws IOException { 1647 HdfsFileStatus auditStat; 1648 checkOperation(OperationCategory.WRITE); 1649 writeLock(); 1650 try { 1651 checkOperation(OperationCategory.WRITE); 1652 checkNameNodeSafeMode("Cannot set permission for " + src); 1653 auditStat = FSDirAttrOp.setPermission(dir, src, permission); 1654 } catch (AccessControlException e) { 1655 logAuditEvent(false, "setPermission", src); 1656 throw e; 1657 } finally { 1658 writeUnlock(); 1659 } 1660 getEditLog().logSync(); 1661 logAuditEvent(true, "setPermission", src, null, auditStat); 1662 } 1663 1664 /** 1665 * Set owner for an existing file. 1666 * @throws IOException 1667 */ 1668 void setOwner(String src, String username, String group) 1669 throws IOException { 1670 HdfsFileStatus auditStat; 1671 checkOperation(OperationCategory.WRITE); 1672 writeLock(); 1673 try { 1674 checkOperation(OperationCategory.WRITE); 1675 checkNameNodeSafeMode("Cannot set owner for " + src); 1676 auditStat = FSDirAttrOp.setOwner(dir, src, username, group); 1677 } catch (AccessControlException e) { 1678 logAuditEvent(false, "setOwner", src); 1679 throw e; 1680 } finally { 1681 writeUnlock(); 1682 } 1683 getEditLog().logSync(); 1684 logAuditEvent(true, "setOwner", src, null, auditStat); 1685 } 1686 1687 static class GetBlockLocationsResult { 1688 final boolean updateAccessTime; 1689 final LocatedBlocks blocks; 1690 boolean updateAccessTime() { 1691 return updateAccessTime; 1692 } 1693 private GetBlockLocationsResult( 1694 boolean updateAccessTime, LocatedBlocks blocks) { 1695 this.updateAccessTime = updateAccessTime; 1696 this.blocks = blocks; 1697 } 1698 } 1699 1700 /** 1701 * Get block locations within the specified range. 
   * @see ClientProtocol#getBlockLocations(String, long, long)
   * @param clientMachine caller's address, used to sort replicas by proximity
   * @param srcArg path to the file
   * @param offset starting byte offset within the file
   * @param length number of bytes to locate
   */
  LocatedBlocks getBlockLocations(String clientMachine, String srcArg,
      long offset, long length) throws IOException {
    checkOperation(OperationCategory.READ);
    GetBlockLocationsResult res = null;
    FSPermissionChecker pc = getPermissionChecker();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      res = getBlockLocations(pc, srcArg, offset, length, true, true);
    } catch (AccessControlException e) {
      logAuditEvent(false, "open", srcArg);
      throw e;
    } finally {
      readUnlock();
    }

    logAuditEvent(true, "open", srcArg);

    // The lookup above ran under the read lock; bumping atime needs the
    // write lock, so the path must be re-resolved after re-acquisition.
    if (res.updateAccessTime()) {
      byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(
          srcArg);
      String src = srcArg;
      writeLock();
      final long now = now();
      try {
        checkOperation(OperationCategory.WRITE);
        /**
         * Resolve the path again and update the atime only when the file
         * exists.
         *
         * XXX: Races can still occur even after resolving the path again.
         * For example:
         *
         * <ul>
         * <li>Get the block location for "/a/b"</li>
         * <li>Rename "/a/b" to "/c/b"</li>
         * <li>The second resolution still points to "/a/b", which is
         * wrong.</li>
         * </ul>
         *
         * The behavior is incorrect but consistent with the one before
         * HDFS-7463. A better fix is to change the edit log of SetTime to
         * use inode id instead of a path.
         */
        src = dir.resolvePath(pc, srcArg, pathComponents);
        final INodesInPath iip = dir.getINodesInPath(src, true);
        INode inode = iip.getLastINode();
        boolean updateAccessTime = inode != null &&
            now > inode.getAccessTime() + getAccessTimePrecision();
        if (!isInSafeMode() && updateAccessTime) {
          boolean changed = FSDirAttrOp.setTimes(dir,
              inode, -1, now, false, iip.getLatestSnapshotId());
          if (changed) {
            getEditLog().logTimes(src, -1, now);
          }
        }
      } catch (Throwable e) {
        // atime update is best-effort: a failure must not fail the read.
        LOG.warn("Failed to update the access time of " + src, e);
      } finally {
        writeUnlock();
      }
    }

    LocatedBlocks blocks = res.blocks;
    if (blocks != null) {
      // Order replicas by network distance from the requesting client.
      blockManager.getDatanodeManager().sortLocatedBlocks(
          clientMachine, blocks.getLocatedBlocks());

      // lastBlock is not part of getLocatedBlocks(), might need to sort it too
      LocatedBlock lastBlock = blocks.getLastLocatedBlock();
      if (lastBlock != null) {
        ArrayList<LocatedBlock> lastBlockList = Lists.newArrayList(lastBlock);
        blockManager.getDatanodeManager().sortLocatedBlocks(
            clientMachine, lastBlockList);
      }
    }
    return blocks;
  }

  /**
   * Get block locations within the specified range.
   * @see ClientProtocol#getBlockLocations(String, long, long)
   * @param checkSafeMode if true, reject (or ask the client to retry, when
   *        this is the active NN of an HA pair) blocks with no reported
   *        locations while in safe mode
   * @throws IOException
   */
  GetBlockLocationsResult getBlockLocations(
      FSPermissionChecker pc, String src, long offset, long length,
      boolean needBlockToken, boolean checkSafeMode) throws IOException {
    if (offset < 0) {
      throw new HadoopIllegalArgumentException(
          "Negative offset is not supported. File: " + src);
    }
    if (length < 0) {
      throw new HadoopIllegalArgumentException(
          "Negative length is not supported. File: " + src);
    }
    final GetBlockLocationsResult ret = getBlockLocationsInt(
        pc, src, offset, length, needBlockToken);

    if (checkSafeMode && isInSafeMode()) {
      for (LocatedBlock b : ret.blocks.getLocatedBlocks()) {
        // if safemode & no block locations yet then throw safemodeException
        if ((b.getLocations() == null) || (b.getLocations().length == 0)) {
          SafeModeException se = new SafeModeException(
              "Zero blocklocations for " + src, safeMode);
          if (haEnabled && haContext != null &&
              haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
            // Active NN in safe mode will likely leave it soon; let the
            // client retry instead of failing outright.
            throw new RetriableException(se);
          } else {
            throw se;
          }
        }
      }
    }
    return ret;
  }

  /** Core of the location lookup; caller holds the read lock. */
  private GetBlockLocationsResult getBlockLocationsInt(
      FSPermissionChecker pc, final String srcArg, long offset, long length,
      boolean needBlockToken)
      throws IOException {
    String src = srcArg;
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    src = dir.resolvePath(pc, srcArg, pathComponents);
    final INodesInPath iip = dir.getINodesInPath(src, true);
    final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
    if (isPermissionEnabled) {
      dir.checkPathAccess(pc, iip, FsAction.READ);
      checkUnreadableBySuperuser(pc, inode, iip.getPathSnapshotId());
    }

    // For a live file, exclude the (still growing) last under-construction
    // block from the reported size.
    final long fileSize = iip.isSnapshot()
        ? inode.computeFileSize(iip.getPathSnapshotId())
        : inode.computeFileSizeNotIncludingLastUcBlock();
    boolean isUc = inode.isUnderConstruction();
    if (iip.isSnapshot()) {
      // if src indicates a snapshot file, we need to make sure the returned
      // blocks do not exceed the size of the snapshot file.
      length = Math.min(length, fileSize - offset);
      isUc = false;
    }

    // /.reserved/raw reads bypass decryption, so no encryption info there.
    final FileEncryptionInfo feInfo =
        FSDirectory.isReservedRawName(srcArg) ? null
            : dir.getFileEncryptionInfo(inode, iip.getPathSnapshotId(), iip);

    final LocatedBlocks blocks = blockManager.createLocatedBlocks(
        inode.getBlocks(iip.getPathSnapshotId()), fileSize,
        isUc, offset, length, needBlockToken, iip.isSnapshot(), feInfo);

    // Set caching information for the located blocks.
    for (LocatedBlock lb : blocks.getLocatedBlocks()) {
      cacheManager.setCachedLocations(lb);
    }

    // Decide (but do not perform) the atime update; the caller applies it
    // under the write lock.
    final long now = now();
    boolean updateAccessTime = isAccessTimeSupported() && !isInSafeMode()
        && !iip.isSnapshot()
        && now > inode.getAccessTime() + getAccessTimePrecision();
    return new GetBlockLocationsResult(updateAccessTime, blocks);
  }

  /**
   * Moves all the blocks from {@code srcs} and appends them to {@code target}
   * To avoid rollbacks we will verify validity of ALL of the args
   * before we start actual move.
   *
   * This does not support ".inodes" relative path
   * @param target target to concat into
   * @param srcs file that will be concatenated
   * @throws IOException on error
   */
  void concat(String target, String [] srcs, boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    waitForLoadingFSImage();
    HdfsFileStatus stat = null;
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot concat " + target);
      stat = FSDirConcatOp.concat(dir, target, srcs, logRetryCache);
      success = true;
    } finally {
      writeUnlock();
      // Audit both success and failure; only sync edits on success.
      if (success) {
        getEditLog().logSync();
      }
      logAuditEvent(success, "concat", Arrays.toString(srcs), target, stat);
    }
  }

  /**
   * stores the modification and access time for this inode.
   * The access time is precise up to an hour. The transaction, if needed, is
   * written to the edits log but is not flushed.
1900 */ 1901 void setTimes(String src, long mtime, long atime) throws IOException { 1902 HdfsFileStatus auditStat; 1903 checkOperation(OperationCategory.WRITE); 1904 writeLock(); 1905 try { 1906 checkOperation(OperationCategory.WRITE); 1907 checkNameNodeSafeMode("Cannot set times " + src); 1908 auditStat = FSDirAttrOp.setTimes(dir, src, mtime, atime); 1909 } catch (AccessControlException e) { 1910 logAuditEvent(false, "setTimes", src); 1911 throw e; 1912 } finally { 1913 writeUnlock(); 1914 } 1915 getEditLog().logSync(); 1916 logAuditEvent(true, "setTimes", src, null, auditStat); 1917 } 1918 1919 /** 1920 * Create a symbolic link. 1921 */ 1922 @SuppressWarnings("deprecation") 1923 void createSymlink(String target, String link, 1924 PermissionStatus dirPerms, boolean createParent, boolean logRetryCache) 1925 throws IOException { 1926 if (!FileSystem.areSymlinksEnabled()) { 1927 throw new UnsupportedOperationException("Symlinks not supported"); 1928 } 1929 HdfsFileStatus auditStat = null; 1930 checkOperation(OperationCategory.WRITE); 1931 writeLock(); 1932 try { 1933 checkOperation(OperationCategory.WRITE); 1934 checkNameNodeSafeMode("Cannot create symlink " + link); 1935 auditStat = FSDirSymlinkOp.createSymlinkInt(this, target, link, dirPerms, 1936 createParent, logRetryCache); 1937 } catch (AccessControlException e) { 1938 logAuditEvent(false, "createSymlink", link, target, null); 1939 throw e; 1940 } finally { 1941 writeUnlock(); 1942 } 1943 getEditLog().logSync(); 1944 logAuditEvent(true, "createSymlink", link, target, auditStat); 1945 } 1946 1947 /** 1948 * Set replication for an existing file. 1949 * 1950 * The NameNode sets new replication and schedules either replication of 1951 * under-replicated data blocks or removal of the excessive block copies 1952 * if the blocks are over-replicated. 
   *
   * @see ClientProtocol#setReplication(String, short)
   * @param src file name
   * @param replication new replication
   * @return true if successful;
   *         false if file does not exist or is a directory
   */
  boolean setReplication(final String src, final short replication)
      throws IOException {
    boolean success = false;
    waitForLoadingFSImage();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      // Re-check under the lock; state may have changed since above.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set replication for " + src);
      success = FSDirAttrOp.setReplication(dir, blockManager, src, replication);
    } catch (AccessControlException e) {
      logAuditEvent(false, "setReplication", src);
      throw e;
    } finally {
      writeUnlock();
    }
    // Only sync edits and audit when the replication actually changed.
    if (success) {
      getEditLog().logSync();
      logAuditEvent(true, "setReplication", src);
    }
    return success;
  }

  /**
   * Truncate file to a lower length.
   * Truncate cannot be reverted / recovered from as it causes data loss.
   * Truncation at block boundary is atomic, otherwise it requires
   * block recovery to truncate the last block of the file.
   *
   * @return true if client does not need to wait for block recovery,
   *         false if client needs to wait for block recovery.
   */
  boolean truncate(String src, long newLength,
                   String clientName, String clientMachine,
                   long mtime)
      throws IOException, UnresolvedLinkException {
    boolean ret;
    try {
      ret = truncateInt(src, newLength, clientName, clientMachine, mtime);
    } catch (AccessControlException e) {
      // Audit only the access failure here; truncateInt audits success.
      logAuditEvent(false, "truncate", src);
      throw e;
    }
    return ret;
  }

  /** Locked portion of truncate; resolves the path and delegates to
   *  {@link #truncateInternal}. */
  boolean truncateInt(String srcArg, long newLength,
                      String clientName, String clientMachine,
                      long mtime)
      throws IOException, UnresolvedLinkException {
    String src = srcArg;
    NameNode.stateChangeLog.debug(
        "DIR* NameSystem.truncate: src={} newLength={}", src, newLength);
    if (newLength < 0) {
      throw new HadoopIllegalArgumentException(
          "Cannot truncate to a negative file size: " + newLength + ".");
    }
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    boolean res;
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    // Collects blocks freed by the truncation; deleted outside the lock.
    BlocksMapUpdateInfo toRemoveBlocks = new BlocksMapUpdateInfo();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot truncate for " + src);
      src = dir.resolvePath(pc, src, pathComponents);
      res = truncateInternal(src, newLength, clientName,
          clientMachine, mtime, pc, toRemoveBlocks);
      stat = dir.getAuditFileInfo(dir.getINodesInPath4Write(src, false));
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    // Block removal is deferred until after the lock is released.
    if (!toRemoveBlocks.getToDeleteList().isEmpty()) {
      removeBlocks(toRemoveBlocks);
      toRemoveBlocks.clear();
    }
    logAuditEvent(true, "truncate", src, null, stat);
    return res;
  }

  /**
   * Truncate a file to a given size
   * Update the count at each ancestor directory with quota
   */
  boolean truncateInternal(String src, long newLength,
      String
      clientName, String clientMachine,
      long mtime, FSPermissionChecker pc,
      BlocksMapUpdateInfo toRemoveBlocks)
      throws IOException, UnresolvedLinkException {
    assert hasWriteLock();
    INodesInPath iip = dir.getINodesInPath4Write(src, true);
    if (isPermissionEnabled) {
      dir.checkPathAccess(pc, iip, FsAction.WRITE);
    }
    INodeFile file = INodeFile.valueOf(iip.getLastINode(), src);
    // LAZY_PERSIST (in-memory) files cannot be truncated.
    final BlockStoragePolicy lpPolicy =
        blockManager.getStoragePolicy("LAZY_PERSIST");

    if (lpPolicy != null &&
        lpPolicy.getId() == file.getStoragePolicyID()) {
      throw new UnsupportedOperationException(
          "Cannot truncate lazy persist file " + src);
    }

    // Check if the file is already being truncated with the same length
    // (an RPC retry); if so this is a no-op and the client must wait for
    // the in-progress recovery.
    final BlockInfoContiguous last = file.getLastBlock();
    if (last != null && last.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
      final Block truncateBlock
          = ((BlockInfoContiguousUnderConstruction)last).getTruncateBlock();
      if (truncateBlock != null) {
        final long truncateLength = file.computeFileSize(false, false)
            + truncateBlock.getNumBytes();
        if (newLength == truncateLength) {
          return false;
        }
      }
    }

    // Opening an existing file for truncate. May need lease recovery.
    recoverLeaseInternal(RecoverLeaseOp.TRUNCATE_FILE,
        iip, src, clientName, clientMachine, false);
    // Truncate length check.
    long oldLength = file.computeFileSize();
    if(oldLength == newLength) {
      // Already the requested size: nothing to do, no recovery needed.
      return true;
    }
    if(oldLength < newLength) {
      throw new HadoopIllegalArgumentException(
          "Cannot truncate to a larger file size. Current size: " + oldLength +
              ", truncate size: " + newLength + ".");
    }
    // Perform INodeFile truncation.
    final QuotaCounts delta = new QuotaCounts.Builder().build();
    boolean onBlockBoundary = dir.truncate(iip, newLength, toRemoveBlocks,
        mtime, delta);
    Block truncateBlock = null;
    if(!onBlockBoundary) {
      // Open file for write, but don't log into edits
      long lastBlockDelta = file.computeFileSize() - newLength;
      assert lastBlockDelta > 0 : "delta is 0 only if on block bounday";
      truncateBlock = prepareFileForTruncate(iip, clientName, clientMachine,
          lastBlockDelta, null);
    }

    // update the quota: use the preferred block size for UC block
    dir.writeLock();
    try {
      dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta);
    } finally {
      dir.writeUnlock();
    }

    getEditLog().logTruncate(src, clientName, clientMachine, newLength, mtime,
        truncateBlock);
    return onBlockBoundary;
  }

  /**
   * Convert current INode to UnderConstruction.
   * Recreate lease.
   * Create new block for the truncated copy.
   * Schedule truncation of the replicas.
   *
   * @param newBlock null on the initial call; non-null when replayed from
   *        the edit log (see return doc below)
   * @return the returned block will be written to editLog and passed back into
   * this method upon loading.
   */
  Block prepareFileForTruncate(INodesInPath iip,
      String leaseHolder,
      String clientMachine,
      long lastBlockDelta,
      Block newBlock)
      throws IOException {
    INodeFile file = iip.getLastINode().asFile();
    String src = iip.getPath();
    file.recordModification(iip.getLatestSnapshotId());
    file.toUnderConstruction(leaseHolder, clientMachine);
    assert file.isUnderConstruction() : "inode should be under construction.";
    leaseManager.addLease(
        file.getFileUnderConstructionFeature().getClientName(), src);
    // Recovery starts immediately only on the live code path (newBlock
    // == null); edit-log replay defers it.
    boolean shouldRecoverNow = (newBlock == null);
    BlockInfoContiguous oldBlock = file.getLastBlock();
    boolean shouldCopyOnTruncate = shouldCopyOnTruncate(file, oldBlock);
    if(newBlock == null) {
      // Copy-on-truncate allocates a brand-new block id; in-place reuses
      // the id with a fresh generation stamp.
      newBlock = (shouldCopyOnTruncate) ? createNewBlock() :
          new Block(oldBlock.getBlockId(), oldBlock.getNumBytes(),
              nextGenerationStamp(blockIdManager.isLegacyBlock(oldBlock)));
    }

    BlockInfoContiguousUnderConstruction truncatedBlockUC;
    if(shouldCopyOnTruncate) {
      // Add new truncateBlock into blocksMap and
      // use oldBlock as a source for copy-on-truncate recovery
      truncatedBlockUC = new BlockInfoContiguousUnderConstruction(newBlock,
          file.getBlockReplication());
      truncatedBlockUC.setNumBytes(oldBlock.getNumBytes() - lastBlockDelta);
      truncatedBlockUC.setTruncateBlock(oldBlock);
      file.setLastBlock(truncatedBlockUC, blockManager.getStorages(oldBlock));
      getBlockManager().addBlockCollection(truncatedBlockUC, file);

      NameNode.stateChangeLog.debug(
          "BLOCK* prepareFileForTruncate: Scheduling copy-on-truncate to new" +
              " size {} new block {} old block {}",
          truncatedBlockUC.getNumBytes(),
          newBlock, truncatedBlockUC.getTruncateBlock());
    } else {
      // Use new generation stamp for in-place truncate recovery
      blockManager.convertLastBlockToUnderConstruction(file, lastBlockDelta);
      oldBlock = file.getLastBlock();
      assert !oldBlock.isComplete() : "oldBlock should be under construction";
      truncatedBlockUC = (BlockInfoContiguousUnderConstruction) oldBlock;
      truncatedBlockUC.setTruncateBlock(new Block(oldBlock));
      truncatedBlockUC.getTruncateBlock().setNumBytes(
          oldBlock.getNumBytes() - lastBlockDelta);
      truncatedBlockUC.getTruncateBlock().setGenerationStamp(
          newBlock.getGenerationStamp());

      NameNode.stateChangeLog.debug(
          "BLOCK* prepareFileForTruncate: {} Scheduling in-place block " +
              "truncate to new size {}",
          truncatedBlockUC.getTruncateBlock().getNumBytes(), truncatedBlockUC);
    }
    if (shouldRecoverNow) {
      truncatedBlockUC.initializeBlockRecovery(newBlock.getGenerationStamp());
    }

    return newBlock;
  }

  /**
   * Defines if a
replica needs to be copied on truncate or 2192 * can be truncated in place. 2193 */ 2194 boolean shouldCopyOnTruncate(INodeFile file, BlockInfoContiguous blk) { 2195 if(!isUpgradeFinalized()) { 2196 return true; 2197 } 2198 if (isRollingUpgrade()) { 2199 return true; 2200 } 2201 return file.isBlockInLatestSnapshot(blk); 2202 } 2203 2204 /** 2205 * Set the storage policy for a file or a directory. 2206 * 2207 * @param src file/directory path 2208 * @param policyName storage policy name 2209 */ 2210 void setStoragePolicy(String src, String policyName) throws IOException { 2211 HdfsFileStatus auditStat; 2212 waitForLoadingFSImage(); 2213 checkOperation(OperationCategory.WRITE); 2214 writeLock(); 2215 try { 2216 checkOperation(OperationCategory.WRITE); 2217 checkNameNodeSafeMode("Cannot set storage policy for " + src); 2218 auditStat = FSDirAttrOp.setStoragePolicy( 2219 dir, blockManager, src, policyName); 2220 } catch (AccessControlException e) { 2221 logAuditEvent(false, "setStoragePolicy", src); 2222 throw e; 2223 } finally { 2224 writeUnlock(); 2225 } 2226 getEditLog().logSync(); 2227 logAuditEvent(true, "setStoragePolicy", src, null, auditStat); 2228 } 2229 2230 /** 2231 * @return All the existing block storage policies 2232 */ 2233 BlockStoragePolicy[] getStoragePolicies() throws IOException { 2234 checkOperation(OperationCategory.READ); 2235 waitForLoadingFSImage(); 2236 readLock(); 2237 try { 2238 checkOperation(OperationCategory.READ); 2239 return FSDirAttrOp.getStoragePolicies(blockManager); 2240 } finally { 2241 readUnlock(); 2242 } 2243 } 2244 2245 long getPreferredBlockSize(String src) throws IOException { 2246 checkOperation(OperationCategory.READ); 2247 readLock(); 2248 try { 2249 checkOperation(OperationCategory.READ); 2250 return FSDirAttrOp.getPreferredBlockSize(dir, src); 2251 } finally { 2252 readUnlock(); 2253 } 2254 } 2255 2256 /** 2257 * If the file is within an encryption zone, select the appropriate 2258 * CryptoProtocolVersion from the list 
provided by the client. Since the 2259 * client may be newer, we need to handle unknown versions. 2260 * 2261 * @param zone EncryptionZone of the file 2262 * @param supportedVersions List of supported protocol versions 2263 * @return chosen protocol version 2264 * @throws IOException 2265 */ 2266 private CryptoProtocolVersion chooseProtocolVersion(EncryptionZone zone, 2267 CryptoProtocolVersion[] supportedVersions) 2268 throws UnknownCryptoProtocolVersionException, UnresolvedLinkException, 2269 SnapshotAccessControlException { 2270 Preconditions.checkNotNull(zone); 2271 Preconditions.checkNotNull(supportedVersions); 2272 // Right now, we only support a single protocol version, 2273 // so simply look for it in the list of provided options 2274 final CryptoProtocolVersion required = zone.getVersion(); 2275 2276 for (CryptoProtocolVersion c : supportedVersions) { 2277 if (c.equals(CryptoProtocolVersion.UNKNOWN)) { 2278 if (LOG.isDebugEnabled()) { 2279 LOG.debug("Ignoring unknown CryptoProtocolVersion provided by " + 2280 "client: " + c.getUnknownValue()); 2281 } 2282 continue; 2283 } 2284 if (c.equals(required)) { 2285 return c; 2286 } 2287 } 2288 throw new UnknownCryptoProtocolVersionException( 2289 "No crypto protocol versions provided by the client are supported." 2290 + " Client provided: " + Arrays.toString(supportedVersions) 2291 + " NameNode supports: " + Arrays.toString(CryptoProtocolVersion 2292 .values())); 2293 } 2294 2295 /** 2296 * Invoke KeyProvider APIs to generate an encrypted data encryption key for an 2297 * encryption zone. Should not be called with any locks held. 
2298 * 2299 * @param ezKeyName key name of an encryption zone 2300 * @return New EDEK, or null if ezKeyName is null 2301 * @throws IOException 2302 */ 2303 private EncryptedKeyVersion generateEncryptedDataEncryptionKey(String 2304 ezKeyName) throws IOException { 2305 if (ezKeyName == null) { 2306 return null; 2307 } 2308 EncryptedKeyVersion edek = null; 2309 try { 2310 edek = provider.generateEncryptedKey(ezKeyName); 2311 } catch (GeneralSecurityException e) { 2312 throw new IOException(e); 2313 } 2314 Preconditions.checkNotNull(edek); 2315 return edek; 2316 } 2317 2318 /** 2319 * Create a new file entry in the namespace. 2320 * 2321 * For description of parameters and exceptions thrown see 2322 * {@link ClientProtocol#create}, except it returns valid file status upon 2323 * success 2324 */ 2325 HdfsFileStatus startFile(String src, PermissionStatus permissions, 2326 String holder, String clientMachine, EnumSet<CreateFlag> flag, 2327 boolean createParent, short replication, long blockSize, 2328 CryptoProtocolVersion[] supportedVersions, boolean logRetryCache) 2329 throws AccessControlException, SafeModeException, 2330 FileAlreadyExistsException, UnresolvedLinkException, 2331 FileNotFoundException, ParentNotDirectoryException, IOException { 2332 2333 HdfsFileStatus status = null; 2334 try { 2335 status = startFileInt(src, permissions, holder, clientMachine, flag, 2336 createParent, replication, blockSize, supportedVersions, 2337 logRetryCache); 2338 } catch (AccessControlException e) { 2339 logAuditEvent(false, "create", src); 2340 throw e; 2341 } 2342 return status; 2343 } 2344 2345 private HdfsFileStatus startFileInt(final String srcArg, 2346 PermissionStatus permissions, String holder, String clientMachine, 2347 EnumSet<CreateFlag> flag, boolean createParent, short replication, 2348 long blockSize, CryptoProtocolVersion[] supportedVersions, 2349 boolean logRetryCache) 2350 throws AccessControlException, SafeModeException, 2351 FileAlreadyExistsException, 
UnresolvedLinkException, 2352 FileNotFoundException, ParentNotDirectoryException, IOException { 2353 String src = srcArg; 2354 if (NameNode.stateChangeLog.isDebugEnabled()) { 2355 StringBuilder builder = new StringBuilder(); 2356 builder.append("DIR* NameSystem.startFile: src=" + src 2357 + ", holder=" + holder 2358 + ", clientMachine=" + clientMachine 2359 + ", createParent=" + createParent 2360 + ", replication=" + replication 2361 + ", createFlag=" + flag.toString() 2362 + ", blockSize=" + blockSize); 2363 builder.append(", supportedVersions="); 2364 if (supportedVersions != null) { 2365 builder.append(Arrays.toString(supportedVersions)); 2366 } else { 2367 builder.append("null"); 2368 } 2369 NameNode.stateChangeLog.debug(builder.toString()); 2370 } 2371 if (!DFSUtil.isValidName(src)) { 2372 throw new InvalidPathException(src); 2373 } 2374 blockManager.verifyReplication(src, replication, clientMachine); 2375 2376 boolean skipSync = false; 2377 HdfsFileStatus stat = null; 2378 FSPermissionChecker pc = getPermissionChecker(); 2379 checkOperation(OperationCategory.WRITE); 2380 if (blockSize < minBlockSize) { 2381 throw new IOException("Specified block size is less than configured" + 2382 " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY 2383 + "): " + blockSize + " < " + minBlockSize); 2384 } 2385 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src); 2386 boolean create = flag.contains(CreateFlag.CREATE); 2387 boolean overwrite = flag.contains(CreateFlag.OVERWRITE); 2388 boolean isLazyPersist = flag.contains(CreateFlag.LAZY_PERSIST); 2389 2390 waitForLoadingFSImage(); 2391 2392 /** 2393 * If the file is in an encryption zone, we optimistically create an 2394 * EDEK for the file by calling out to the configured KeyProvider. 2395 * Since this typically involves doing an RPC, we take the readLock 2396 * initially, then drop it to do the RPC. 
2397 * 2398 * Since the path can flip-flop between being in an encryption zone and not 2399 * in the meantime, we need to recheck the preconditions when we retake the 2400 * lock to do the create. If the preconditions are not met, we throw a 2401 * special RetryStartFileException to ask the DFSClient to try the create 2402 * again later. 2403 */ 2404 CryptoProtocolVersion protocolVersion = null; 2405 CipherSuite suite = null; 2406 String ezKeyName = null; 2407 EncryptedKeyVersion edek = null; 2408 2409 if (provider != null) { 2410 readLock(); 2411 try { 2412 src = dir.resolvePath(pc, src, pathComponents); 2413 INodesInPath iip = dir.getINodesInPath4Write(src); 2414 // Nothing to do if the path is not within an EZ 2415 final EncryptionZone zone = dir.getEZForPath(iip); 2416 if (zone != null) { 2417 protocolVersion = chooseProtocolVersion(zone, supportedVersions); 2418 suite = zone.getSuite(); 2419 ezKeyName = zone.getKeyName(); 2420 2421 Preconditions.checkNotNull(protocolVersion); 2422 Preconditions.checkNotNull(suite); 2423 Preconditions.checkArgument(!suite.equals(CipherSuite.UNKNOWN), 2424 "Chose an UNKNOWN CipherSuite!"); 2425 Preconditions.checkNotNull(ezKeyName); 2426 } 2427 } finally { 2428 readUnlock(); 2429 } 2430 2431 Preconditions.checkState( 2432 (suite == null && ezKeyName == null) || 2433 (suite != null && ezKeyName != null), 2434 "Both suite and ezKeyName should both be null or not null"); 2435 2436 // Generate EDEK if necessary while not holding the lock 2437 edek = generateEncryptedDataEncryptionKey(ezKeyName); 2438 EncryptionFaultInjector.getInstance().startFileAfterGenerateKey(); 2439 } 2440 2441 // Proceed with the create, using the computed cipher suite and 2442 // generated EDEK 2443 BlocksMapUpdateInfo toRemoveBlocks = null; 2444 writeLock(); 2445 try { 2446 checkOperation(OperationCategory.WRITE); 2447 checkNameNodeSafeMode("Cannot create file" + src); 2448 dir.writeLock(); 2449 try { 2450 src = dir.resolvePath(pc, src, pathComponents); 2451 
final INodesInPath iip = dir.getINodesInPath4Write(src); 2452 toRemoveBlocks = startFileInternal( 2453 pc, iip, permissions, holder, 2454 clientMachine, create, overwrite, 2455 createParent, replication, blockSize, 2456 isLazyPersist, suite, protocolVersion, edek, 2457 logRetryCache); 2458 stat = FSDirStatAndListingOp.getFileInfo( 2459 dir, src, false, FSDirectory.isReservedRawName(srcArg), true); 2460 } finally { 2461 dir.writeUnlock(); 2462 } 2463 } catch (StandbyException se) { 2464 skipSync = true; 2465 throw se; 2466 } finally { 2467 writeUnlock(); 2468 // There might be transactions logged while trying to recover the lease. 2469 // They need to be sync'ed even when an exception was thrown. 2470 if (!skipSync) { 2471 getEditLog().logSync(); 2472 if (toRemoveBlocks != null) { 2473 removeBlocks(toRemoveBlocks); 2474 toRemoveBlocks.clear(); 2475 } 2476 } 2477 } 2478 2479 logAuditEvent(true, "create", srcArg, null, stat); 2480 return stat; 2481 } 2482 2483 /** 2484 * Create a new file or overwrite an existing file<br> 2485 * 2486 * Once the file is create the client then allocates a new block with the next 2487 * call using {@link ClientProtocol#addBlock}. 2488 * <p> 2489 * For description of parameters and exceptions thrown see 2490 * {@link ClientProtocol#create} 2491 */ 2492 private BlocksMapUpdateInfo startFileInternal(FSPermissionChecker pc, 2493 INodesInPath iip, PermissionStatus permissions, String holder, 2494 String clientMachine, boolean create, boolean overwrite, 2495 boolean createParent, short replication, long blockSize, 2496 boolean isLazyPersist, CipherSuite suite, CryptoProtocolVersion version, 2497 EncryptedKeyVersion edek, boolean logRetryEntry) 2498 throws IOException { 2499 assert hasWriteLock(); 2500 // Verify that the destination does not exist as a directory already. 
    final INode inode = iip.getLastINode();
    final String src = iip.getPath();
    if (inode != null && inode.isDirectory()) {
      throw new FileAlreadyExistsException(src +
          " already exists as a directory");
    }

    final INodeFile myFile = INodeFile.valueOf(inode, src, true);
    if (isPermissionEnabled) {
      if (overwrite && myFile != null) {
        dir.checkPathAccess(pc, iip, FsAction.WRITE);
      }
      /*
       * To overwrite existing file, need to check 'w' permission
       * of parent (equals to ancestor in this case)
       */
      dir.checkAncestorAccess(pc, iip, FsAction.WRITE);
    }
    if (!createParent) {
      dir.verifyParentDir(iip, src);
    }

    FileEncryptionInfo feInfo = null;

    final EncryptionZone zone = dir.getEZForPath(iip);
    if (zone != null) {
      // The path is now within an EZ, but we're missing encryption parameters
      // (the zone appeared after the caller's unlocked EDEK pre-generation);
      // the client must retry the create.
      if (suite == null || edek == null) {
        throw new RetryStartFileException();
      }
      // Path is within an EZ and we have provided encryption parameters.
      // Make sure that the generated EDEK matches the settings of the EZ.
      final String ezKeyName = zone.getKeyName();
      if (!ezKeyName.equals(edek.getEncryptionKeyName())) {
        throw new RetryStartFileException();
      }
      feInfo = new FileEncryptionInfo(suite, version,
          edek.getEncryptedKeyVersion().getMaterial(),
          edek.getEncryptedKeyIv(),
          ezKeyName, edek.getEncryptionKeyVersionName());
    }

    try {
      // Non-null only on the overwrite path; returned so the caller can
      // remove the replaced file's blocks outside the lock.
      BlocksMapUpdateInfo toRemoveBlocks = null;
      if (myFile == null) {
        if (!create) {
          throw new FileNotFoundException("Can't overwrite non-existent " +
              src + " for client " + clientMachine);
        }
      } else {
        if (overwrite) {
          // Delete the existing file in place; blocks/leases cleaned here,
          // actual block removal deferred to the caller.
          toRemoveBlocks = new BlocksMapUpdateInfo();
          List<INode> toRemoveINodes = new ChunkedArrayList<INode>();
          long ret = FSDirDeleteOp.delete(dir, iip, toRemoveBlocks,
              toRemoveINodes, now());
          if (ret >= 0) {
            iip = INodesInPath.replace(iip, iip.length() - 1, null);
            FSDirDeleteOp.incrDeletedFileCount(ret);
            removeLeasesAndINodes(src, toRemoveINodes, true);
          }
        } else {
          // If lease soft limit time is expired, recover the lease
          recoverLeaseInternal(RecoverLeaseOp.CREATE_FILE,
              iip, src, holder, clientMachine, false);
          throw new FileAlreadyExistsException(src + " for client " +
              clientMachine + " already exists");
        }
      }

      checkFsObjectLimit();
      INodeFile newNode = null;

      // Always do an implicit mkdirs for parent directory tree.
      Map.Entry<INodesInPath, String> parent = FSDirMkdirOp
          .createAncestorDirectories(dir, iip, permissions);
      if (parent != null) {
        iip = dir.addFile(parent.getKey(), parent.getValue(), permissions,
            replication, blockSize, holder, clientMachine);
        newNode = iip != null ? iip.getLastINode().asFile() : null;
      }

      if (newNode == null) {
        throw new IOException("Unable to add " + src + " to namespace");
      }
      leaseManager.addLease(newNode.getFileUnderConstructionFeature()
          .getClientName(), src);

      // Set encryption attributes if necessary
      if (feInfo != null) {
        dir.setFileEncryptionInfo(src, feInfo);
        // Re-fetch: setting the xattr may have replaced the inode object.
        newNode = dir.getInode(newNode.getId()).asFile();
      }

      setNewINodeStoragePolicy(newNode, iip, isLazyPersist);

      // record file record in log, record new generation stamp
      getEditLog().logOpenFile(src, newNode, overwrite, logRetryEntry);
      NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: added {}" +
          " inode {} holder {}", src, newNode.getId(), holder);
      return toRemoveBlocks;
    } catch (IOException ie) {
      NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: " + src + " " +
          ie.getMessage());
      throw ie;
    }
  }

  /** Apply the storage policy for a newly created file: either the explicit
   *  LAZY_PERSIST policy or one inherited from an ancestor directory. */
  private void setNewINodeStoragePolicy(INodeFile inode,
      INodesInPath iip,
      boolean isLazyPersist)
      throws IOException {

    if (isLazyPersist) {
      BlockStoragePolicy lpPolicy =
          blockManager.getStoragePolicy("LAZY_PERSIST");

      // Set LAZY_PERSIST storage policy if the flag was passed to
      // CreateFile.
      if (lpPolicy == null) {
        throw new HadoopIllegalArgumentException(
            "The LAZY_PERSIST storage policy has been disabled " +
                "by the administrator.");
      }
      inode.setStoragePolicyID(lpPolicy.getId(),
          iip.getLatestSnapshotId());
    } else {
      BlockStoragePolicy effectivePolicy =
          blockManager.getStoragePolicy(inode.getStoragePolicyID());

      if (effectivePolicy != null &&
          effectivePolicy.isCopyOnCreateFile()) {
        // Copy effective policy from ancestor directory to current file.
2633 inode.setStoragePolicyID(effectivePolicy.getId(), 2634 iip.getLatestSnapshotId()); 2635 } 2636 } 2637 } 2638 2639 /** 2640 * Append to an existing file for append. 2641 * <p> 2642 * 2643 * The method returns the last block of the file if this is a partial block, 2644 * which can still be used for writing more data. The client uses the returned 2645 * block locations to form the data pipeline for this block.<br> 2646 * The method returns null if the last block is full. The client then 2647 * allocates a new block with the next call using 2648 * {@link ClientProtocol#addBlock}. 2649 * <p> 2650 * 2651 * For description of parameters and exceptions thrown see 2652 * {@link ClientProtocol#append(String, String, EnumSetWritable)} 2653 * 2654 * @return the last block locations if the block is partial or null otherwise 2655 */ 2656 private LocatedBlock appendFileInternal(FSPermissionChecker pc, 2657 INodesInPath iip, String holder, String clientMachine, boolean newBlock, 2658 boolean logRetryCache) throws IOException { 2659 assert hasWriteLock(); 2660 // Verify that the destination does not exist as a directory already. 
2661 final INode inode = iip.getLastINode(); 2662 final String src = iip.getPath(); 2663 if (inode != null && inode.isDirectory()) { 2664 throw new FileAlreadyExistsException("Cannot append to directory " + src 2665 + "; already exists as a directory."); 2666 } 2667 if (isPermissionEnabled) { 2668 dir.checkPathAccess(pc, iip, FsAction.WRITE); 2669 } 2670 2671 try { 2672 if (inode == null) { 2673 throw new FileNotFoundException("failed to append to non-existent file " 2674 + src + " for client " + clientMachine); 2675 } 2676 INodeFile myFile = INodeFile.valueOf(inode, src, true); 2677 final BlockStoragePolicy lpPolicy = 2678 blockManager.getStoragePolicy("LAZY_PERSIST"); 2679 if (lpPolicy != null && 2680 lpPolicy.getId() == myFile.getStoragePolicyID()) { 2681 throw new UnsupportedOperationException( 2682 "Cannot append to lazy persist file " + src); 2683 } 2684 // Opening an existing file for append - may need to recover lease. 2685 recoverLeaseInternal(RecoverLeaseOp.APPEND_FILE, 2686 iip, src, holder, clientMachine, false); 2687 2688 final BlockInfoContiguous lastBlock = myFile.getLastBlock(); 2689 // Check that the block has at least minimum replication. 2690 if(lastBlock != null && lastBlock.isComplete() && 2691 !getBlockManager().isSufficientlyReplicated(lastBlock)) { 2692 throw new IOException("append: lastBlock=" + lastBlock + 2693 " of src=" + src + " is not sufficiently replicated yet."); 2694 } 2695 return prepareFileForAppend(src, iip, holder, clientMachine, newBlock, 2696 true, logRetryCache); 2697 } catch (IOException ie) { 2698 NameNode.stateChangeLog.warn("DIR* NameSystem.append: " +ie.getMessage()); 2699 throw ie; 2700 } 2701 } 2702 2703 /** 2704 * Convert current node to under construction. 2705 * Recreate in-memory lease record. 
2706 * 2707 * @param src path to the file 2708 * @param leaseHolder identifier of the lease holder on this file 2709 * @param clientMachine identifier of the client machine 2710 * @param newBlock if the data is appended to a new block 2711 * @param writeToEditLog whether to persist this change to the edit log 2712 * @param logRetryCache whether to record RPC ids in editlog for retry cache 2713 * rebuilding 2714 * @return the last block locations if the block is partial or null otherwise 2715 * @throws UnresolvedLinkException 2716 * @throws IOException 2717 */ 2718 LocatedBlock prepareFileForAppend(String src, INodesInPath iip, 2719 String leaseHolder, String clientMachine, boolean newBlock, 2720 boolean writeToEditLog, boolean logRetryCache) throws IOException { 2721 final INodeFile file = iip.getLastINode().asFile(); 2722 final QuotaCounts delta = verifyQuotaForUCBlock(file, iip); 2723 2724 file.recordModification(iip.getLatestSnapshotId()); 2725 file.toUnderConstruction(leaseHolder, clientMachine); 2726 2727 leaseManager.addLease( 2728 file.getFileUnderConstructionFeature().getClientName(), src); 2729 2730 LocatedBlock ret = null; 2731 if (!newBlock) { 2732 ret = blockManager.convertLastBlockToUnderConstruction(file, 0); 2733 if (ret != null && delta != null) { 2734 Preconditions.checkState(delta.getStorageSpace() >= 0, 2735 "appending to a block with size larger than the preferred block size"); 2736 dir.writeLock(); 2737 try { 2738 dir.updateCountNoQuotaCheck(iip, iip.length() - 1, delta); 2739 } finally { 2740 dir.writeUnlock(); 2741 } 2742 } 2743 } else { 2744 BlockInfoContiguous lastBlock = file.getLastBlock(); 2745 if (lastBlock != null) { 2746 ExtendedBlock blk = new ExtendedBlock(this.getBlockPoolId(), lastBlock); 2747 ret = new LocatedBlock(blk, new DatanodeInfo[0]); 2748 } 2749 } 2750 2751 if (writeToEditLog) { 2752 getEditLog().logAppendFile(src, file, newBlock, logRetryCache); 2753 } 2754 return ret; 2755 } 2756 2757 /** 2758 * Verify quota when using 
the preferred block size for UC block. This is 2759 * usually used by append and truncate 2760 * @throws QuotaExceededException when violating the storage quota 2761 * @return expected quota usage update. null means no change or no need to 2762 * update quota usage later 2763 */ 2764 private QuotaCounts verifyQuotaForUCBlock(INodeFile file, INodesInPath iip) 2765 throws QuotaExceededException { 2766 if (!isImageLoaded() || dir.shouldSkipQuotaChecks()) { 2767 // Do not check quota if editlog is still being processed 2768 return null; 2769 } 2770 if (file.getLastBlock() != null) { 2771 final QuotaCounts delta = computeQuotaDeltaForUCBlock(file); 2772 dir.readLock(); 2773 try { 2774 FSDirectory.verifyQuota(iip, iip.length() - 1, delta, null); 2775 return delta; 2776 } finally { 2777 dir.readUnlock(); 2778 } 2779 } 2780 return null; 2781 } 2782 2783 /** Compute quota change for converting a complete block to a UC block */ 2784 private QuotaCounts computeQuotaDeltaForUCBlock(INodeFile file) { 2785 final QuotaCounts delta = new QuotaCounts.Builder().build(); 2786 final BlockInfoContiguous lastBlock = file.getLastBlock(); 2787 if (lastBlock != null) { 2788 final long diff = file.getPreferredBlockSize() - lastBlock.getNumBytes(); 2789 final short repl = file.getBlockReplication(); 2790 delta.addStorageSpace(diff * repl); 2791 final BlockStoragePolicy policy = dir.getBlockStoragePolicySuite() 2792 .getPolicy(file.getStoragePolicyID()); 2793 List<StorageType> types = policy.chooseStorageTypes(repl); 2794 for (StorageType t : types) { 2795 if (t.supportTypeQuota()) { 2796 delta.addTypeSpace(t, diff); 2797 } 2798 } 2799 } 2800 return delta; 2801 } 2802 2803 /** 2804 * Recover lease; 2805 * Immediately revoke the lease of the current lease holder and start lease 2806 * recovery so that the file can be forced to be closed. 
2807 * 2808 * @param src the path of the file to start lease recovery 2809 * @param holder the lease holder's name 2810 * @param clientMachine the client machine's name 2811 * @return true if the file is already closed or 2812 * if the lease can be released and the file can be closed. 2813 * @throws IOException 2814 */ 2815 boolean recoverLease(String src, String holder, String clientMachine) 2816 throws IOException { 2817 if (!DFSUtil.isValidName(src)) { 2818 throw new IOException("Invalid file name: " + src); 2819 } 2820 2821 boolean skipSync = false; 2822 FSPermissionChecker pc = getPermissionChecker(); 2823 checkOperation(OperationCategory.WRITE); 2824 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src); 2825 writeLock(); 2826 try { 2827 checkOperation(OperationCategory.WRITE); 2828 checkNameNodeSafeMode("Cannot recover the lease of " + src); 2829 src = dir.resolvePath(pc, src, pathComponents); 2830 final INodesInPath iip = dir.getINodesInPath4Write(src); 2831 final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src); 2832 if (!inode.isUnderConstruction()) { 2833 return true; 2834 } 2835 if (isPermissionEnabled) { 2836 dir.checkPathAccess(pc, iip, FsAction.WRITE); 2837 } 2838 2839 return recoverLeaseInternal(RecoverLeaseOp.RECOVER_LEASE, 2840 iip, src, holder, clientMachine, true); 2841 } catch (StandbyException se) { 2842 skipSync = true; 2843 throw se; 2844 } finally { 2845 writeUnlock(); 2846 // There might be transactions logged while trying to recover the lease. 2847 // They need to be sync'ed even when an exception was thrown. 
2848 if (!skipSync) { 2849 getEditLog().logSync(); 2850 } 2851 } 2852 } 2853 2854 private enum RecoverLeaseOp { 2855 CREATE_FILE, 2856 APPEND_FILE, 2857 TRUNCATE_FILE, 2858 RECOVER_LEASE; 2859 2860 private String getExceptionMessage(String src, String holder, 2861 String clientMachine, String reason) { 2862 return "Failed to " + this + " " + src + " for " + holder + 2863 " on " + clientMachine + " because " + reason; 2864 } 2865 } 2866 2867 boolean recoverLeaseInternal(RecoverLeaseOp op, INodesInPath iip, 2868 String src, String holder, String clientMachine, boolean force) 2869 throws IOException { 2870 assert hasWriteLock(); 2871 INodeFile file = iip.getLastINode().asFile(); 2872 if (file.isUnderConstruction()) { 2873 // 2874 // If the file is under construction , then it must be in our 2875 // leases. Find the appropriate lease record. 2876 // 2877 Lease lease = leaseManager.getLease(holder); 2878 2879 if (!force && lease != null) { 2880 Lease leaseFile = leaseManager.getLeaseByPath(src); 2881 if (leaseFile != null && leaseFile.equals(lease)) { 2882 // We found the lease for this file but the original 2883 // holder is trying to obtain it again. 2884 throw new AlreadyBeingCreatedException( 2885 op.getExceptionMessage(src, holder, clientMachine, 2886 holder + " is already the current lease holder.")); 2887 } 2888 } 2889 // 2890 // Find the original holder. 
2891 // 2892 FileUnderConstructionFeature uc = file.getFileUnderConstructionFeature(); 2893 String clientName = uc.getClientName(); 2894 lease = leaseManager.getLease(clientName); 2895 if (lease == null) { 2896 throw new AlreadyBeingCreatedException( 2897 op.getExceptionMessage(src, holder, clientMachine, 2898 "the file is under construction but no leases found.")); 2899 } 2900 if (force) { 2901 // close now: no need to wait for soft lease expiration and 2902 // close only the file src 2903 LOG.info("recoverLease: " + lease + ", src=" + src + 2904 " from client " + clientName); 2905 return internalReleaseLease(lease, src, iip, holder); 2906 } else { 2907 assert lease.getHolder().equals(clientName) : 2908 "Current lease holder " + lease.getHolder() + 2909 " does not match file creator " + clientName; 2910 // 2911 // If the original holder has not renewed in the last SOFTLIMIT 2912 // period, then start lease recovery. 2913 // 2914 if (lease.expiredSoftLimit()) { 2915 LOG.info("startFile: recover " + lease + ", src=" + src + " client " 2916 + clientName); 2917 if (internalReleaseLease(lease, src, iip, null)) { 2918 return true; 2919 } else { 2920 throw new RecoveryInProgressException( 2921 op.getExceptionMessage(src, holder, clientMachine, 2922 "lease recovery is in progress. 
Try again later.")); 2923 } 2924 } else { 2925 final BlockInfoContiguous lastBlock = file.getLastBlock(); 2926 if (lastBlock != null 2927 && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) { 2928 throw new RecoveryInProgressException( 2929 op.getExceptionMessage(src, holder, clientMachine, 2930 "another recovery is in progress by " 2931 + clientName + " on " + uc.getClientMachine())); 2932 } else { 2933 throw new AlreadyBeingCreatedException( 2934 op.getExceptionMessage(src, holder, clientMachine, 2935 "this file lease is currently owned by " 2936 + clientName + " on " + uc.getClientMachine())); 2937 } 2938 } 2939 } 2940 } else { 2941 return true; 2942 } 2943 } 2944 2945 /** 2946 * Append to an existing file in the namespace. 2947 */ 2948 LastBlockWithStatus appendFile(String src, String holder, 2949 String clientMachine, EnumSet<CreateFlag> flag, boolean logRetryCache) 2950 throws IOException { 2951 try { 2952 return appendFileInt(src, holder, clientMachine, 2953 flag.contains(CreateFlag.NEW_BLOCK), logRetryCache); 2954 } catch (AccessControlException e) { 2955 logAuditEvent(false, "append", src); 2956 throw e; 2957 } 2958 } 2959 2960 private LastBlockWithStatus appendFileInt(final String srcArg, String holder, 2961 String clientMachine, boolean newBlock, boolean logRetryCache) 2962 throws IOException { 2963 String src = srcArg; 2964 NameNode.stateChangeLog.debug( 2965 "DIR* NameSystem.appendFile: src={}, holder={}, clientMachine={}", 2966 src, holder, clientMachine); 2967 boolean skipSync = false; 2968 if (!supportAppends) { 2969 throw new UnsupportedOperationException( 2970 "Append is not enabled on this NameNode. 
Use the " + 2971 DFS_SUPPORT_APPEND_KEY + " configuration option to enable it."); 2972 } 2973 2974 LocatedBlock lb = null; 2975 HdfsFileStatus stat = null; 2976 FSPermissionChecker pc = getPermissionChecker(); 2977 checkOperation(OperationCategory.WRITE); 2978 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src); 2979 writeLock(); 2980 try { 2981 checkOperation(OperationCategory.WRITE); 2982 checkNameNodeSafeMode("Cannot append to file" + src); 2983 src = dir.resolvePath(pc, src, pathComponents); 2984 final INodesInPath iip = dir.getINodesInPath4Write(src); 2985 lb = appendFileInternal(pc, iip, holder, clientMachine, newBlock, 2986 logRetryCache); 2987 stat = FSDirStatAndListingOp.getFileInfo(dir, src, false, 2988 FSDirectory.isReservedRawName(srcArg), true); 2989 } catch (StandbyException se) { 2990 skipSync = true; 2991 throw se; 2992 } finally { 2993 writeUnlock(); 2994 // There might be transactions logged while trying to recover the lease. 2995 // They need to be sync'ed even when an exception was thrown. 2996 if (!skipSync) { 2997 getEditLog().logSync(); 2998 } 2999 } 3000 if (lb != null) { 3001 NameNode.stateChangeLog.debug( 3002 "DIR* NameSystem.appendFile: file {} for {} at {} block {} block" + 3003 " size {}", src, holder, clientMachine, lb.getBlock(), 3004 lb.getBlock().getNumBytes()); 3005 } 3006 logAuditEvent(true, "append", srcArg); 3007 return new LastBlockWithStatus(lb, stat); 3008 } 3009 3010 ExtendedBlock getExtendedBlock(Block blk) { 3011 return new ExtendedBlock(blockPoolId, blk); 3012 } 3013 3014 void setBlockPoolId(String bpid) { 3015 blockPoolId = bpid; 3016 blockManager.setBlockPoolId(blockPoolId); 3017 } 3018 3019 /** 3020 * The client would like to obtain an additional block for the indicated 3021 * filename (which is being written-to). Return an array that consists 3022 * of the block, plus a set of machines. The first on this list should 3023 * be where the client writes data. 
Subsequent items in the list must 3024 * be provided in the connection to the first datanode. 3025 * 3026 * Make sure the previous blocks have been reported by datanodes and 3027 * are replicated. Will return an empty 2-elt array if we want the 3028 * client to "try again later". 3029 */ 3030 LocatedBlock getAdditionalBlock(String src, long fileId, String clientName, 3031 ExtendedBlock previous, Set<Node> excludedNodes, 3032 List<String> favoredNodes) throws IOException { 3033 LocatedBlock[] onRetryBlock = new LocatedBlock[1]; 3034 DatanodeStorageInfo targets[] = getNewBlockTargets(src, fileId, 3035 clientName, previous, excludedNodes, favoredNodes, onRetryBlock); 3036 if (targets == null) { 3037 assert onRetryBlock[0] != null : "Retry block is null"; 3038 // This is a retry. Just return the last block. 3039 return onRetryBlock[0]; 3040 } 3041 LocatedBlock newBlock = storeAllocatedBlock( 3042 src, fileId, clientName, previous, targets); 3043 return newBlock; 3044 } 3045 3046 /** 3047 * Part I of getAdditionalBlock(). 3048 * Analyze the state of the file under read lock to determine if the client 3049 * can add a new block, detect potential retries, lease mismatches, 3050 * and minimal replication of the penultimate block. 3051 * 3052 * Generate target DataNode locations for the new block, 3053 * but do not create the new block yet. 
3054 */ 3055 DatanodeStorageInfo[] getNewBlockTargets(String src, long fileId, 3056 String clientName, ExtendedBlock previous, Set<Node> excludedNodes, 3057 List<String> favoredNodes, LocatedBlock[] onRetryBlock) throws IOException { 3058 final long blockSize; 3059 final int replication; 3060 final byte storagePolicyID; 3061 Node clientNode = null; 3062 String clientMachine = null; 3063 3064 NameNode.stateChangeLog.debug("BLOCK* getAdditionalBlock: {} inodeId {}" + 3065 " for {}", src, fileId, clientName); 3066 3067 checkOperation(OperationCategory.READ); 3068 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src); 3069 FSPermissionChecker pc = getPermissionChecker(); 3070 readLock(); 3071 try { 3072 checkOperation(OperationCategory.READ); 3073 src = dir.resolvePath(pc, src, pathComponents); 3074 FileState fileState = analyzeFileState( 3075 src, fileId, clientName, previous, onRetryBlock); 3076 final INodeFile pendingFile = fileState.inode; 3077 // Check if the penultimate block is minimally replicated 3078 if (!checkFileProgress(src, pendingFile, false)) { 3079 throw new NotReplicatedYetException("Not replicated yet: " + src); 3080 } 3081 src = fileState.path; 3082 3083 if (onRetryBlock[0] != null && onRetryBlock[0].getLocations().length > 0) { 3084 // This is a retry. No need to generate new locations. 3085 // Use the last block if it has locations. 
3086 return null; 3087 } 3088 if (pendingFile.getBlocks().length >= maxBlocksPerFile) { 3089 throw new IOException("File has reached the limit on maximum number of" 3090 + " blocks (" + DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY 3091 + "): " + pendingFile.getBlocks().length + " >= " 3092 + maxBlocksPerFile); 3093 } 3094 blockSize = pendingFile.getPreferredBlockSize(); 3095 clientMachine = pendingFile.getFileUnderConstructionFeature() 3096 .getClientMachine(); 3097 clientNode = blockManager.getDatanodeManager().getDatanodeByHost( 3098 clientMachine); 3099 replication = pendingFile.getFileReplication(); 3100 storagePolicyID = pendingFile.getStoragePolicyID(); 3101 } finally { 3102 readUnlock(); 3103 } 3104 3105 if (clientNode == null) { 3106 clientNode = getClientNode(clientMachine); 3107 } 3108 3109 // choose targets for the new block to be allocated. 3110 return getBlockManager().chooseTarget4NewBlock( 3111 src, replication, clientNode, excludedNodes, blockSize, favoredNodes, 3112 storagePolicyID); 3113 } 3114 3115 /** 3116 * Part II of getAdditionalBlock(). 3117 * Should repeat the same analysis of the file state as in Part 1, 3118 * but under the write lock. 3119 * If the conditions still hold, then allocate a new block with 3120 * the new targets, add it to the INode and to the BlocksMap. 3121 */ 3122 LocatedBlock storeAllocatedBlock(String src, long fileId, String clientName, 3123 ExtendedBlock previous, DatanodeStorageInfo[] targets) throws IOException { 3124 Block newBlock = null; 3125 long offset; 3126 checkOperation(OperationCategory.WRITE); 3127 waitForLoadingFSImage(); 3128 writeLock(); 3129 try { 3130 checkOperation(OperationCategory.WRITE); 3131 // Run the full analysis again, since things could have changed 3132 // while chooseTarget() was executing. 
3133 LocatedBlock[] onRetryBlock = new LocatedBlock[1]; 3134 FileState fileState = 3135 analyzeFileState(src, fileId, clientName, previous, onRetryBlock); 3136 final INodeFile pendingFile = fileState.inode; 3137 src = fileState.path; 3138 3139 if (onRetryBlock[0] != null) { 3140 if (onRetryBlock[0].getLocations().length > 0) { 3141 // This is a retry. Just return the last block if having locations. 3142 return onRetryBlock[0]; 3143 } else { 3144 // add new chosen targets to already allocated block and return 3145 BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock(); 3146 ((BlockInfoContiguousUnderConstruction) lastBlockInFile) 3147 .setExpectedLocations(targets); 3148 offset = pendingFile.computeFileSize(); 3149 return makeLocatedBlock(lastBlockInFile, targets, offset); 3150 } 3151 } 3152 3153 // commit the last block and complete it if it has minimum replicas 3154 commitOrCompleteLastBlock(pendingFile, fileState.iip, 3155 ExtendedBlock.getLocalBlock(previous)); 3156 3157 // allocate new block, record block locations in INode. 3158 newBlock = createNewBlock(); 3159 INodesInPath inodesInPath = INodesInPath.fromINode(pendingFile); 3160 saveAllocatedBlock(src, inodesInPath, newBlock, targets); 3161 3162 persistNewBlock(src, pendingFile); 3163 offset = pendingFile.computeFileSize(); 3164 } finally { 3165 writeUnlock(); 3166 } 3167 getEditLog().logSync(); 3168 3169 // Return located block 3170 return makeLocatedBlock(newBlock, targets, offset); 3171 } 3172 3173 /* 3174 * Resolve clientmachine address to get a network location path 3175 */ 3176 private Node getClientNode(String clientMachine) { 3177 List<String> hosts = new ArrayList<String>(1); 3178 hosts.add(clientMachine); 3179 List<String> rName = getBlockManager().getDatanodeManager() 3180 .resolveNetworkLocation(hosts); 3181 Node clientNode = null; 3182 if (rName != null) { 3183 // Able to resolve clientMachine mapping. 
3184 // Create a temp node to findout the rack local nodes 3185 clientNode = new NodeBase(rName.get(0) + NodeBase.PATH_SEPARATOR_STR 3186 + clientMachine); 3187 } 3188 return clientNode; 3189 } 3190 3191 static class FileState { 3192 public final INodeFile inode; 3193 public final String path; 3194 public final INodesInPath iip; 3195 3196 public FileState(INodeFile inode, String fullPath, INodesInPath iip) { 3197 this.inode = inode; 3198 this.path = fullPath; 3199 this.iip = iip; 3200 } 3201 } 3202 3203 FileState analyzeFileState(String src, 3204 long fileId, 3205 String clientName, 3206 ExtendedBlock previous, 3207 LocatedBlock[] onRetryBlock) 3208 throws IOException { 3209 assert hasReadLock(); 3210 3211 checkBlock(previous); 3212 onRetryBlock[0] = null; 3213 checkNameNodeSafeMode("Cannot add block to " + src); 3214 3215 // have we exceeded the configured limit of fs objects. 3216 checkFsObjectLimit(); 3217 3218 Block previousBlock = ExtendedBlock.getLocalBlock(previous); 3219 final INode inode; 3220 final INodesInPath iip; 3221 if (fileId == INodeId.GRANDFATHER_INODE_ID) { 3222 // Older clients may not have given us an inode ID to work with. 3223 // In this case, we have to try to resolve the path and hope it 3224 // hasn't changed or been deleted since the file was opened for write. 3225 iip = dir.getINodesInPath4Write(src); 3226 inode = iip.getLastINode(); 3227 } else { 3228 // Newer clients pass the inode ID, so we can just get the inode 3229 // directly. 3230 inode = dir.getInode(fileId); 3231 iip = INodesInPath.fromINode(inode); 3232 if (inode != null) { 3233 src = iip.getPath(); 3234 } 3235 } 3236 final INodeFile pendingFile = checkLease(src, clientName, inode, fileId); 3237 BlockInfoContiguous lastBlockInFile = pendingFile.getLastBlock(); 3238 if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) { 3239 // The block that the client claims is the current last block 3240 // doesn't match up with what we think is the last block. 
There are 3241 // four possibilities: 3242 // 1) This is the first block allocation of an append() pipeline 3243 // which started appending exactly at or exceeding the block boundary. 3244 // In this case, the client isn't passed the previous block, 3245 // so it makes the allocateBlock() call with previous=null. 3246 // We can distinguish this since the last block of the file 3247 // will be exactly a full block. 3248 // 2) This is a retry from a client that missed the response of a 3249 // prior getAdditionalBlock() call, perhaps because of a network 3250 // timeout, or because of an HA failover. In that case, we know 3251 // by the fact that the client is re-issuing the RPC that it 3252 // never began to write to the old block. Hence it is safe to 3253 // to return the existing block. 3254 // 3) This is an entirely bogus request/bug -- we should error out 3255 // rather than potentially appending a new block with an empty 3256 // one in the middle, etc 3257 // 4) This is a retry from a client that timed out while 3258 // the prior getAdditionalBlock() is still being processed, 3259 // currently working on chooseTarget(). 3260 // There are no means to distinguish between the first and 3261 // the second attempts in Part I, because the first one hasn't 3262 // changed the namesystem state yet. 3263 // We run this analysis again in Part II where case 4 is impossible. 
3264 3265 BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock(); 3266 if (previous == null && 3267 lastBlockInFile != null && 3268 lastBlockInFile.getNumBytes() >= pendingFile.getPreferredBlockSize() && 3269 lastBlockInFile.isComplete()) { 3270 // Case 1 3271 NameNode.stateChangeLog.debug( 3272 "BLOCK* NameSystem.allocateBlock: handling block allocation" + 3273 " writing to a file with a complete previous block: src={}" + 3274 " lastBlock={}", src, lastBlockInFile); 3275 } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) { 3276 if (lastBlockInFile.getNumBytes() != 0) { 3277 throw new IOException( 3278 "Request looked like a retry to allocate block " + 3279 lastBlockInFile + " but it already contains " + 3280 lastBlockInFile.getNumBytes() + " bytes"); 3281 } 3282 3283 // Case 2 3284 // Return the last block. 3285 NameNode.stateChangeLog.info("BLOCK* allocateBlock: " + 3286 "caught retry for allocation of a new block in " + 3287 src + ". Returning previously allocated block " + lastBlockInFile); 3288 long offset = pendingFile.computeFileSize(); 3289 onRetryBlock[0] = makeLocatedBlock(lastBlockInFile, 3290 ((BlockInfoContiguousUnderConstruction)lastBlockInFile).getExpectedStorageLocations(), 3291 offset); 3292 return new FileState(pendingFile, src, iip); 3293 } else { 3294 // Case 3 3295 throw new IOException("Cannot allocate block in " + src + ": " + 3296 "passed 'previous' block " + previous + " does not match actual " + 3297 "last block in file " + lastBlockInFile); 3298 } 3299 } 3300 return new FileState(pendingFile, src, iip); 3301 } 3302 3303 LocatedBlock makeLocatedBlock(Block blk, DatanodeStorageInfo[] locs, 3304 long offset) throws IOException { 3305 LocatedBlock lBlk = new LocatedBlock( 3306 getExtendedBlock(blk), locs, offset, false); 3307 getBlockManager().setBlockToken( 3308 lBlk, BlockTokenSecretManager.AccessMode.WRITE); 3309 return lBlk; 3310 } 3311 3312 /** @see ClientProtocol#getAdditionalDatanode */ 3313 
LocatedBlock getAdditionalDatanode(String src, long fileId, 3314 final ExtendedBlock blk, final DatanodeInfo[] existings, 3315 final String[] storageIDs, 3316 final Set<Node> excludes, 3317 final int numAdditionalNodes, final String clientName 3318 ) throws IOException { 3319 //check if the feature is enabled 3320 dtpReplaceDatanodeOnFailure.checkEnabled(); 3321 3322 Node clientnode = null; 3323 String clientMachine; 3324 final long preferredblocksize; 3325 final byte storagePolicyID; 3326 final List<DatanodeStorageInfo> chosen; 3327 checkOperation(OperationCategory.READ); 3328 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src); 3329 FSPermissionChecker pc = getPermissionChecker(); 3330 readLock(); 3331 try { 3332 checkOperation(OperationCategory.READ); 3333 //check safe mode 3334 checkNameNodeSafeMode("Cannot add datanode; src=" + src + ", blk=" + blk); 3335 src = dir.resolvePath(pc, src, pathComponents); 3336 3337 //check lease 3338 final INode inode; 3339 if (fileId == INodeId.GRANDFATHER_INODE_ID) { 3340 // Older clients may not have given us an inode ID to work with. 3341 // In this case, we have to try to resolve the path and hope it 3342 // hasn't changed or been deleted since the file was opened for write. 
3343 inode = dir.getINode(src); 3344 } else { 3345 inode = dir.getInode(fileId); 3346 if (inode != null) src = inode.getFullPathName(); 3347 } 3348 final INodeFile file = checkLease(src, clientName, inode, fileId); 3349 clientMachine = file.getFileUnderConstructionFeature().getClientMachine(); 3350 clientnode = blockManager.getDatanodeManager().getDatanodeByHost(clientMachine); 3351 preferredblocksize = file.getPreferredBlockSize(); 3352 storagePolicyID = file.getStoragePolicyID(); 3353 3354 //find datanode storages 3355 final DatanodeManager dm = blockManager.getDatanodeManager(); 3356 chosen = Arrays.asList(dm.getDatanodeStorageInfos(existings, storageIDs)); 3357 } finally { 3358 readUnlock(); 3359 } 3360 3361 if (clientnode == null) { 3362 clientnode = getClientNode(clientMachine); 3363 } 3364 3365 // choose new datanodes. 3366 final DatanodeStorageInfo[] targets = blockManager.chooseTarget4AdditionalDatanode( 3367 src, numAdditionalNodes, clientnode, chosen, 3368 excludes, preferredblocksize, storagePolicyID); 3369 final LocatedBlock lb = new LocatedBlock(blk, targets); 3370 blockManager.setBlockToken(lb, AccessMode.COPY); 3371 return lb; 3372 } 3373 3374 /** 3375 * The client would like to let go of the given block 3376 */ 3377 boolean abandonBlock(ExtendedBlock b, long fileId, String src, String holder) 3378 throws IOException { 3379 NameNode.stateChangeLog.debug( 3380 "BLOCK* NameSystem.abandonBlock: {} of file {}", b, src); 3381 checkOperation(OperationCategory.WRITE); 3382 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src); 3383 FSPermissionChecker pc = getPermissionChecker(); 3384 waitForLoadingFSImage(); 3385 writeLock(); 3386 try { 3387 checkOperation(OperationCategory.WRITE); 3388 checkNameNodeSafeMode("Cannot abandon block " + b + " for file" + src); 3389 src = dir.resolvePath(pc, src, pathComponents); 3390 3391 final INode inode; 3392 final INodesInPath iip; 3393 if (fileId == INodeId.GRANDFATHER_INODE_ID) { 3394 // Older 
clients may not have given us an inode ID to work with. 3395 // In this case, we have to try to resolve the path and hope it 3396 // hasn't changed or been deleted since the file was opened for write. 3397 iip = dir.getINodesInPath(src, true); 3398 inode = iip.getLastINode(); 3399 } else { 3400 inode = dir.getInode(fileId); 3401 iip = INodesInPath.fromINode(inode); 3402 if (inode != null) { 3403 src = iip.getPath(); 3404 } 3405 } 3406 final INodeFile file = checkLease(src, holder, inode, fileId); 3407 3408 // Remove the block from the pending creates list 3409 boolean removed = dir.removeBlock(src, iip, file, 3410 ExtendedBlock.getLocalBlock(b)); 3411 if (!removed) { 3412 return true; 3413 } 3414 NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: {} is " + 3415 "removed from pendingCreates", b); 3416 persistBlocks(src, file, false); 3417 } finally { 3418 writeUnlock(); 3419 } 3420 getEditLog().logSync(); 3421 3422 return true; 3423 } 3424 3425 private INodeFile checkLease(String src, String holder, INode inode, 3426 long fileId) throws LeaseExpiredException, FileNotFoundException { 3427 assert hasReadLock(); 3428 final String ident = src + " (inode " + fileId + ")"; 3429 if (inode == null) { 3430 Lease lease = leaseManager.getLease(holder); 3431 throw new LeaseExpiredException( 3432 "No lease on " + ident + ": File does not exist. " 3433 + (lease != null ? lease.toString() 3434 : "Holder " + holder + " does not have any open files.")); 3435 } 3436 if (!inode.isFile()) { 3437 Lease lease = leaseManager.getLease(holder); 3438 throw new LeaseExpiredException( 3439 "No lease on " + ident + ": INode is not a regular file. " 3440 + (lease != null ? 
lease.toString() 3441 : "Holder " + holder + " does not have any open files.")); 3442 } 3443 final INodeFile file = inode.asFile(); 3444 if (!file.isUnderConstruction()) { 3445 Lease lease = leaseManager.getLease(holder); 3446 throw new LeaseExpiredException( 3447 "No lease on " + ident + ": File is not open for writing. " 3448 + (lease != null ? lease.toString() 3449 : "Holder " + holder + " does not have any open files.")); 3450 } 3451 // No further modification is allowed on a deleted file. 3452 // A file is considered deleted, if it is not in the inodeMap or is marked 3453 // as deleted in the snapshot feature. 3454 if (isFileDeleted(file)) { 3455 throw new FileNotFoundException(src); 3456 } 3457 String clientName = file.getFileUnderConstructionFeature().getClientName(); 3458 if (holder != null && !clientName.equals(holder)) { 3459 throw new LeaseExpiredException("Lease mismatch on " + ident + 3460 " owned by " + clientName + " but is accessed by " + holder); 3461 } 3462 return file; 3463 } 3464 3465 /** 3466 * Complete in-progress write to the given file. 
 * @return true if successful, false if the client should continue to retry
 *         (e.g if not all blocks have reached minimum replication yet)
 * @throws IOException on error (eg lease mismatch, file not open, file deleted)
 */
boolean completeFile(final String srcArg, String holder,
    ExtendedBlock last, long fileId)
    throws SafeModeException, UnresolvedLinkException, IOException {
  String src = srcArg;
  NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: {} for {}",
      src, holder);
  checkBlock(last);
  boolean success = false;
  checkOperation(OperationCategory.WRITE);
  byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
  FSPermissionChecker pc = getPermissionChecker();
  waitForLoadingFSImage();
  writeLock();
  try {
    // Re-check after taking the lock: HA state may have changed meanwhile.
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot complete file " + src);
    src = dir.resolvePath(pc, src, pathComponents);
    success = completeFileInternal(src, holder,
        ExtendedBlock.getLocalBlock(last), fileId);
  } finally {
    writeUnlock();
  }
  // Sync the edit log outside the namesystem lock.
  getEditLog().logSync();
  if (success) {
    NameNode.stateChangeLog.info("DIR* completeFile: " + srcArg
        + " is closed by " + holder);
  }
  return success;
}

/**
 * Lock-held body of {@link #completeFile}: resolve the inode, validate the
 * lease, verify block replication progress, commit/complete the last block
 * and finalize the file.
 *
 * @param src resolved path of the file
 * @param holder lease holder (client name)
 * @param last the client's view of the last block, may be used to detect
 *        retried close RPCs
 * @param fileId inode id, or {@code INodeId.GRANDFATHER_INODE_ID}
 * @return true if the file was closed; false if the caller should retry
 *         (blocks not yet minimally replicated)
 * @throws IOException on lease or state errors
 */
private boolean completeFileInternal(String src, String holder, Block last,
    long fileId) throws IOException {
  assert hasWriteLock();
  final INodeFile pendingFile;
  final INodesInPath iip;
  INode inode = null;
  try {
    if (fileId == INodeId.GRANDFATHER_INODE_ID) {
      // Older clients may not have given us an inode ID to work with.
      // In this case, we have to try to resolve the path and hope it
      // hasn't changed or been deleted since the file was opened for write.
      iip = dir.getINodesInPath(src, true);
      inode = iip.getLastINode();
    } else {
      inode = dir.getInode(fileId);
      iip = INodesInPath.fromINode(inode);
      if (inode != null) {
        src = iip.getPath();
      }
    }
    pendingFile = checkLease(src, holder, inode, fileId);
  } catch (LeaseExpiredException lee) {
    if (inode != null && inode.isFile() &&
        !inode.asFile().isUnderConstruction()) {
      // This could be a retry RPC - i.e the client tried to close
      // the file, but missed the RPC response. Thus, it is trying
      // again to close the file. If the file still exists and
      // the client's view of the last block matches the actual
      // last block, then we'll treat it as a successful close.
      // See HDFS-3031.
      final Block realLastBlock = inode.asFile().getLastBlock();
      if (Block.matchingIdAndGenStamp(last, realLastBlock)) {
        NameNode.stateChangeLog.info("DIR* completeFile: " +
            "request from " + holder + " to complete inode " + fileId +
            "(" + src + ") which is already closed. But, it appears to be " +
            "an RPC retry. Returning success");
        return true;
      }
    }
    throw lee;
  }
  // Check the state of the penultimate block. It should be completed
  // before attempting to complete the last one.
  if (!checkFileProgress(src, pendingFile, false)) {
    return false;
  }

  // commit the last block and complete it if it has minimum replicas
  commitOrCompleteLastBlock(pendingFile, iip, last);

  if (!checkFileProgress(src, pendingFile, true)) {
    return false;
  }

  finalizeINodeFileUnderConstruction(src, pendingFile,
      Snapshot.CURRENT_STATE_ID);
  return true;
}

/**
 * Save allocated block at the given pending filename
 *
 * @param src path to the file
 * @param inodesInPath representing each of the components of src.
 *                     The last INode is the INode for {@code src} file.
3566 * @param newBlock newly allocated block to be save 3567 * @param targets target datanodes where replicas of the new block is placed 3568 * @throws QuotaExceededException If addition of block exceeds space quota 3569 */ 3570 BlockInfoContiguous saveAllocatedBlock(String src, INodesInPath inodesInPath, 3571 Block newBlock, DatanodeStorageInfo[] targets) 3572 throws IOException { 3573 assert hasWriteLock(); 3574 BlockInfoContiguous b = dir.addBlock(src, inodesInPath, newBlock, targets); 3575 NameNode.stateChangeLog.info("BLOCK* allocate " + b + " for " + src); 3576 DatanodeStorageInfo.incrementBlocksScheduled(targets); 3577 return b; 3578 } 3579 3580 /** 3581 * Create new block with a unique block id and a new generation stamp. 3582 */ 3583 Block createNewBlock() throws IOException { 3584 assert hasWriteLock(); 3585 Block b = new Block(nextBlockId(), 0, 0); 3586 // Increment the generation stamp for every new block. 3587 b.setGenerationStamp(nextGenerationStamp(false)); 3588 return b; 3589 } 3590 3591 /** 3592 * Check that the indicated file's blocks are present and 3593 * replicated. If not, return false. If checkall is true, then check 3594 * all blocks, otherwise check only penultimate block. 3595 */ 3596 boolean checkFileProgress(String src, INodeFile v, boolean checkall) { 3597 if (checkall) { 3598 // check all blocks of the file. 
3599 for (BlockInfoContiguous block: v.getBlocks()) { 3600 if (!isCompleteBlock(src, block, blockManager.minReplication)) { 3601 return false; 3602 } 3603 } 3604 } else { 3605 // check the penultimate block of this file 3606 BlockInfoContiguous b = v.getPenultimateBlock(); 3607 if (b != null 3608 && !isCompleteBlock(src, b, blockManager.minReplication)) { 3609 return false; 3610 } 3611 } 3612 return true; 3613 } 3614 3615 private static boolean isCompleteBlock(String src, BlockInfoContiguous b, int minRepl) { 3616 if (!b.isComplete()) { 3617 final BlockInfoContiguousUnderConstruction uc = (BlockInfoContiguousUnderConstruction)b; 3618 final int numNodes = b.numNodes(); 3619 LOG.info("BLOCK* " + b + " is not COMPLETE (ucState = " 3620 + uc.getBlockUCState() + ", replication# = " + numNodes 3621 + (numNodes < minRepl? " < ": " >= ") 3622 + " minimum = " + minRepl + ") in file " + src); 3623 return false; 3624 } 3625 return true; 3626 } 3627 3628 //////////////////////////////////////////////////////////////// 3629 // Here's how to handle block-copy failure during client write: 3630 // -- As usual, the client's write should result in a streaming 3631 // backup write to a k-machine sequence. 3632 // -- If one of the backup machines fails, no worries. Fail silently. 3633 // -- Before client is allowed to close and finalize file, make sure 3634 // that the blocks are backed up. Namenode may have to issue specific backup 3635 // commands to make up for earlier datanode failures. Once all copies 3636 // are made, edit namespace and return to client. 3637 //////////////////////////////////////////////////////////////// 3638 3639 /** 3640 * Change the indicated filename. 3641 * @deprecated Use {@link #renameTo(String, String, boolean, 3642 * Options.Rename...)} instead. 
 */
@Deprecated
boolean renameTo(String src, String dst, boolean logRetryCache)
    throws IOException {
  waitForLoadingFSImage();
  checkOperation(OperationCategory.WRITE);
  FSDirRenameOp.RenameOldResult ret = null;
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot rename " + src);
    ret = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache);
  } catch (AccessControlException e)  {
    // Audit the denied attempt before re-throwing.
    logAuditEvent(false, "rename", src, dst, null);
    throw e;
  } finally {
    writeUnlock();
  }
  boolean success = ret != null && ret.success;
  if (success) {
    // Sync the edit log outside the namesystem lock.
    getEditLog().logSync();
  }
  logAuditEvent(success, "rename", src, dst,
      ret == null ? null : ret.auditStat);
  return success;
}

/**
 * Rename {@code src} to {@code dst} with the given {@link Options.Rename}
 * options (e.g. OVERWRITE). Blocks belonging to an overwritten destination
 * are removed incrementally after the edit-log sync.
 *
 * @param src source path
 * @param dst destination path
 * @param logRetryCache whether to record this call in the retry cache
 * @param options rename options
 * @throws IOException on safe mode, permission, or rename failure
 */
void renameTo(final String src, final String dst,
    boolean logRetryCache, Options.Rename... options)
    throws IOException {
  waitForLoadingFSImage();
  checkOperation(OperationCategory.WRITE);
  Map.Entry<BlocksMapUpdateInfo, HdfsFileStatus> res = null;
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot rename " + src);
    res = FSDirRenameOp.renameToInt(dir, src, dst, logRetryCache, options);
  } catch (AccessControlException e) {
    logAuditEvent(false, "rename (options=" + Arrays.toString(options) +
        ")", src, dst, null);
    throw e;
  } finally {
    writeUnlock();
  }

  // Sync outside the lock; res is non-null here because renameToInt either
  // returned a result or threw.
  getEditLog().logSync();

  BlocksMapUpdateInfo collectedBlocks = res.getKey();
  HdfsFileStatus auditStat = res.getValue();
  if (!collectedBlocks.getToDeleteList().isEmpty()) {
    // Blocks of an overwritten destination are deleted incrementally.
    removeBlocks(collectedBlocks);
    collectedBlocks.clear();
  }

  logAuditEvent(true, "rename (options=" + Arrays.toString(options) +
      ")", src, dst, auditStat);
}

/**
 * Remove the indicated file from namespace.
 *
 * @see ClientProtocol#delete(String, boolean) for detailed description and
 *      description of exceptions
 */
boolean delete(String src, boolean recursive, boolean logRetryCache)
    throws IOException {
  waitForLoadingFSImage();
  checkOperation(OperationCategory.WRITE);
  BlocksMapUpdateInfo toRemovedBlocks = null;
  writeLock();
  boolean ret = false;
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot delete " + src);
    toRemovedBlocks = FSDirDeleteOp.delete(
        this, src, recursive, logRetryCache);
    // A null result means nothing was deleted (e.g. the path did not exist).
    ret = toRemovedBlocks != null;
  } catch (AccessControlException e) {
    logAuditEvent(false, "delete", src);
    throw e;
  } finally {
    writeUnlock();
  }
  // Sync the edit log outside the namesystem lock, then delete the
  // collected blocks incrementally so other lock waiters can get in.
  getEditLog().logSync();
  if (toRemovedBlocks != null) {
    removeBlocks(toRemovedBlocks); // Incremental deletion of blocks
  }
  logAuditEvent(true, "delete", src);
  return ret;
}

/**
 * @return the permission checker backed by this namesystem's directory.
 * @throws AccessControlException if the checker cannot be constructed
 */
FSPermissionChecker getPermissionChecker()
    throws AccessControlException {
  return dir.getPermissionChecker();
}

/**
 * From the given list, incrementally remove the blocks from blockManager
 * Writelock is dropped and reacquired every BLOCK_DELETION_INCREMENT to
 * ensure that other waiters on the lock can get in.
 * See HDFS-2938
 *
 * @param blocks
 *          An instance of {@link BlocksMapUpdateInfo} which contains a list
 *          of blocks that need to be removed from blocksMap
 */
void removeBlocks(BlocksMapUpdateInfo blocks) {
  List<Block> toDeleteList = blocks.getToDeleteList();
  Iterator<Block> iter = toDeleteList.iterator();
  while (iter.hasNext()) {
    // Re-acquire the write lock for each batch so that other operations
    // can interleave with a large deletion.
    writeLock();
    try {
      for (int i = 0; i < BLOCK_DELETION_INCREMENT && iter.hasNext(); i++) {
        blockManager.removeBlock(iter.next());
      }
    } finally {
      writeUnlock();
    }
  }
}

/**
 * Remove leases and inodes related to a given path
 * @param src The given path
 * @param removedINodes Containing the list of inodes to be removed from
 *                      inodesMap
 * @param acquireINodeMapLock Whether to acquire the lock for inode removal
 */
void removeLeasesAndINodes(String src, List<INode> removedINodes,
    final boolean acquireINodeMapLock) {
  assert hasWriteLock();
  leaseManager.removeLeaseWithPrefixPath(src);
  // remove inodes from inodesMap
  if (removedINodes != null) {
    if (acquireINodeMapLock) {
      dir.writeLock();
    }
    try {
      dir.removeFromInodeMap(removedINodes);
    } finally {
      if (acquireINodeMapLock) {
        dir.writeUnlock();
      }
    }
    removedINodes.clear();
  }
}

/**
 * Removes the blocks from blocksmap and updates the safemode blocks total
 *
 * @param blocks
 *          An instance of {@link BlocksMapUpdateInfo} which contains a list
 *          of blocks that need to be removed from blocksMap
 */
void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) {
  assert hasWriteLock();
  // In the case that we are a Standby tailing edits from the
  // active while in safe-mode, we need to track the total number
  // of blocks and safe blocks in the system.
  boolean trackBlockCounts = isSafeModeTrackingBlocks();
  int numRemovedComplete = 0, numRemovedSafe = 0;

  for (Block b : blocks.getToDeleteList()) {
    if (trackBlockCounts) {
      // NOTE(review): assumes every block in the to-delete list is still in
      // the blocksMap (we hold the write lock, so nothing removed it since
      // collection) — getStoredBlock returning null would NPE here; confirm.
      BlockInfoContiguous bi = getStoredBlock(b);
      if (bi.isComplete()) {
        numRemovedComplete++;
        if (bi.numNodes() >= blockManager.minReplication) {
          numRemovedSafe++;
        }
      }
    }
    blockManager.removeBlock(b);
  }
  if (trackBlockCounts) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Adjusting safe-mode totals for deletion."
          + "decreasing safeBlocks by " + numRemovedSafe
          + ", totalBlocks by " + numRemovedComplete);
    }
    adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete);
  }
}

/**
 * Whether safe-mode block counts must be tracked incrementally: only on an
 * HA standby whose SafeModeInfo requests it.
 *
 * @see SafeModeInfo#shouldIncrementallyTrackBlocks
 */
private boolean isSafeModeTrackingBlocks() {
  if (!haEnabled) {
    // Never track blocks incrementally in non-HA code.
    return false;
  }
  SafeModeInfo sm = this.safeMode;
  return sm != null && sm.shouldIncrementallyTrackBlocks();
}

/**
 * Get the file info for a specific file.
 *
 * @param src The string representation of the path to the file
 * @param resolveLink whether to throw UnresolvedLinkException
 *        if src refers to a symlink
 *
 * @throws AccessControlException if access is denied
 * @throws UnresolvedLinkException if a symlink is encountered.
 *
 * @return object containing information regarding the file
 *         or null if file not found
 * @throws StandbyException
 */
HdfsFileStatus getFileInfo(final String src, boolean resolveLink)
    throws IOException {
  checkOperation(OperationCategory.READ);
  HdfsFileStatus stat = null;
  readLock();
  try {
    checkOperation(OperationCategory.READ);
    stat = FSDirStatAndListingOp.getFileInfo(dir, src, resolveLink);
  } catch (AccessControlException e) {
    // Audit the denied attempt before re-throwing.
    logAuditEvent(false, "getfileinfo", src);
    throw e;
  } finally {
    readUnlock();
  }
  logAuditEvent(true, "getfileinfo", src);
  return stat;
}

/**
 * Returns true if the file is closed
 */
boolean isFileClosed(final String src) throws IOException {
  checkOperation(OperationCategory.READ);
  readLock();
  try {
    checkOperation(OperationCategory.READ);
    return FSDirStatAndListingOp.isFileClosed(dir, src);
  } catch (AccessControlException e) {
    logAuditEvent(false, "isFileClosed", src);
    throw e;
  } finally {
    readUnlock();
  }
}

/**
 * Create all the necessary directories
 *
 * @param src path of the directory to create
 * @param permissions permissions to apply
 * @param createParent whether missing parents should also be created
 * @return true on success
 * @throws IOException on safe mode or permission failure
 */
boolean mkdirs(String src, PermissionStatus permissions,
    boolean createParent) throws IOException {
  HdfsFileStatus auditStat = null;
  checkOperation(OperationCategory.WRITE);
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot create directory " + src);
    auditStat = FSDirMkdirOp.mkdirs(this, src, permissions, createParent);
  } catch (AccessControlException e) {
    logAuditEvent(false, "mkdirs", src);
    throw e;
  } finally {
    writeUnlock();
  }
  // Sync the edit log outside the namesystem lock.
  getEditLog().logSync();
  logAuditEvent(true, "mkdirs", src, null, auditStat);
  return true;
}

/**
 * Get the content summary for a specific file/dir.
 *
 * @param src The string representation of the path to the file
 *
 * @throws AccessControlException if access is denied
 * @throws UnresolvedLinkException if a symlink is encountered.
 * @throws FileNotFoundException if no file exists
 * @throws StandbyException
 * @throws IOException for issues with writing to the audit log
 *
 * @return object containing information regarding the file
 *         or null if file not found
 */
ContentSummary getContentSummary(final String src) throws IOException {
  readLock();
  boolean success = true;
  try {
    return FSDirStatAndListingOp.getContentSummary(dir, src);
  } catch (AccessControlException ace) {
    // NOTE(review): only AccessControlException flips success to false;
    // other failures (e.g. FileNotFoundException) are audited as success —
    // confirm this is intentional.
    success = false;
    throw ace;
  } finally {
    readUnlock();
    logAuditEvent(success, "contentSummary", src);
  }
}

/**
 * Set the namespace quota and storage space quota for a directory.
 * See {@link ClientProtocol#setQuota(String, long, long, StorageType)} for the
 * contract.
 *
 * Note: This does not support ".inodes" relative path.
 */
void setQuota(String src, long nsQuota, long ssQuota, StorageType type)
    throws IOException {
  checkOperation(OperationCategory.WRITE);
  writeLock();
  boolean success = false;
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot set quota on " + src);
    FSDirAttrOp.setQuota(dir, src, nsQuota, ssQuota, type);
    success = true;
  } finally {
    writeUnlock();
    if (success) {
      // Sync only if the quota update was actually logged.
      getEditLog().logSync();
    }
    logAuditEvent(success, "setQuota", src);
  }
}

/** Persist all metadata about this file.
 * @param src The string representation of the path
 * @param fileId The inode ID that we're fsyncing. Older clients will pass
 *               INodeId.GRANDFATHER_INODE_ID here.
 * @param clientName The string representation of the client
 * @param lastBlockLength The length of the last block
 *                        under construction reported from client.
 * @throws IOException if path does not exist
 */
void fsync(String src, long fileId, String clientName, long lastBlockLength)
    throws IOException {
  NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
  checkOperation(OperationCategory.WRITE);
  byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);

  FSPermissionChecker pc = getPermissionChecker();
  waitForLoadingFSImage();
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot fsync file " + src);
    src = dir.resolvePath(pc, src, pathComponents);
    final INode inode;
    if (fileId == INodeId.GRANDFATHER_INODE_ID) {
      // Older clients may not have given us an inode ID to work with.
      // In this case, we have to try to resolve the path and hope it
      // hasn't changed or been deleted since the file was opened for write.
      inode = dir.getINode(src);
    } else {
      inode = dir.getInode(fileId);
      if (inode != null) src = inode.getFullPathName();
    }
    final INodeFile pendingFile = checkLease(src, clientName, inode, fileId);
    if (lastBlockLength > 0) {
      // Record the client-reported length of the in-progress last block.
      pendingFile.getFileUnderConstructionFeature().updateLengthOfLastBlock(
          pendingFile, lastBlockLength);
    }
    persistBlocks(src, pendingFile, false);
  } finally {
    writeUnlock();
  }
  // Sync the edit log outside the namesystem lock.
  getEditLog().logSync();
}

/**
 * Move a file that is being written to be immutable.
 * @param src The filename
 * @param lease The lease for the client creating the file
 * @param recoveryLeaseHolder reassign lease to this holder if the last block
 *        needs recovery; keep current holder if null.
 * @throws AlreadyBeingCreatedException if file is waiting to achieve minimal
 *         replication;<br>
 *         RecoveryInProgressException if lease recovery is in progress.<br>
 *         IOException in case of an error.
 * @return true if file has been successfully finalized and closed or
 *         false if block recovery has been initiated. Since the lease owner
 *         has been changed and logged, caller should call logSync().
 */
boolean internalReleaseLease(Lease lease, String src, INodesInPath iip,
    String recoveryLeaseHolder) throws IOException {
  LOG.info("Recovering " + lease + ", src=" + src);
  assert !isInSafeMode();
  assert hasWriteLock();

  final INodeFile pendingFile = iip.getLastINode().asFile();
  int nrBlocks = pendingFile.numBlocks();
  BlockInfoContiguous[] blocks = pendingFile.getBlocks();

  // Count the leading run of COMPLETE blocks; curBlock is left pointing at
  // the first non-COMPLETE block (if any).
  int nrCompleteBlocks;
  BlockInfoContiguous curBlock = null;
  for(nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) {
    curBlock = blocks[nrCompleteBlocks];
    if(!curBlock.isComplete())
      break;
    assert blockManager.checkMinReplication(curBlock) :
      "A COMPLETE block is not minimally replicated in " + src;
  }

  // If there are no incomplete blocks associated with this file,
  // then reap lease immediately and close the file.
  if(nrCompleteBlocks == nrBlocks) {
    finalizeINodeFileUnderConstruction(src, pendingFile,
        iip.getLatestSnapshotId());
    NameNode.stateChangeLog.warn("BLOCK*"
        + " internalReleaseLease: All existing blocks are COMPLETE,"
        + " lease removed, file closed.");
    return true;  // closed!
  }

  // Only the last and the penultimate blocks may be in non COMPLETE state.
  // If the penultimate block is not COMPLETE, then it must be COMMITTED.
  if(nrCompleteBlocks < nrBlocks - 2 ||
     nrCompleteBlocks == nrBlocks - 2 &&
       curBlock != null &&
       curBlock.getBlockUCState() != BlockUCState.COMMITTED) {
    final String message = "DIR* NameSystem.internalReleaseLease: "
        + "attempt to release a create lock on "
        + src + " but file is already closed.";
    NameNode.stateChangeLog.warn(message);
    throw new IOException(message);
  }

  // The last block is not COMPLETE, and
  // that the penultimate block if exists is either COMPLETE or COMMITTED
  final BlockInfoContiguous lastBlock = pendingFile.getLastBlock();
  BlockUCState lastBlockState = lastBlock.getBlockUCState();
  BlockInfoContiguous penultimateBlock = pendingFile.getPenultimateBlock();

  // If penultimate block doesn't exist then its minReplication is met
  boolean penultimateBlockMinReplication = penultimateBlock == null ? true :
      blockManager.checkMinReplication(penultimateBlock);

  switch(lastBlockState) {
  case COMPLETE:
    assert false : "Already checked that the last block is incomplete";
    break;
  case COMMITTED:
    // Close file if committed blocks are minimally replicated
    if(penultimateBlockMinReplication &&
        blockManager.checkMinReplication(lastBlock)) {
      finalizeINodeFileUnderConstruction(src, pendingFile,
          iip.getLatestSnapshotId());
      NameNode.stateChangeLog.warn("BLOCK*"
          + " internalReleaseLease: Committed blocks are minimally replicated,"
          + " lease removed, file closed.");
      return true;  // closed!
    }
    // Cannot close file right now, since some blocks
    // are not yet minimally replicated.
    // This may potentially cause infinite loop in lease recovery
    // if there are no valid replicas on data-nodes.
    String message = "DIR* NameSystem.internalReleaseLease: " +
        "Failed to release lease for file " + src +
        ". Committed blocks are waiting to be minimally replicated." +
        " Try again later.";
    NameNode.stateChangeLog.warn(message);
    throw new AlreadyBeingCreatedException(message);
  case UNDER_CONSTRUCTION:
  case UNDER_RECOVERY:
    final BlockInfoContiguousUnderConstruction uc =
        (BlockInfoContiguousUnderConstruction)lastBlock;
    // determine if last block was intended to be truncated
    Block recoveryBlock = uc.getTruncateBlock();
    boolean truncateRecovery = recoveryBlock != null;
    boolean copyOnTruncate = truncateRecovery &&
        recoveryBlock.getBlockId() != uc.getBlockId();
    assert !copyOnTruncate ||
        recoveryBlock.getBlockId() < uc.getBlockId() &&
        recoveryBlock.getGenerationStamp() < uc.getGenerationStamp() &&
        recoveryBlock.getNumBytes() > uc.getNumBytes() :
        "wrong recoveryBlock";

    // setup the last block locations from the blockManager if not known
    if (uc.getNumExpectedLocations() == 0) {
      uc.setExpectedLocations(blockManager.getStorages(lastBlock));
    }

    if (uc.getNumExpectedLocations() == 0 && uc.getNumBytes() == 0) {
      // There is no datanode reported to this block.
      // may be client have crashed before writing data to pipeline.
      // This blocks doesn't need any recovery.
      // We can remove this block and close the file.
      pendingFile.removeLastBlock(lastBlock);
      finalizeINodeFileUnderConstruction(src, pendingFile,
          iip.getLatestSnapshotId());
      NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: "
          + "Removed empty last block and closed file.");
      return true;
    }
    // start recovery of the last block for this file
    long blockRecoveryId = nextGenerationStamp(blockIdManager.isLegacyBlock(uc));
    lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile);
    if(copyOnTruncate) {
      uc.setGenerationStamp(blockRecoveryId);
    } else if(truncateRecovery) {
      recoveryBlock.setGenerationStamp(blockRecoveryId);
    }
    uc.initializeBlockRecovery(blockRecoveryId);
    leaseManager.renewLease(lease);
    // Cannot close file right now, since the last block requires recovery.
    // This may potentially cause infinite loop in lease recovery
    // if there are no valid replicas on data-nodes.
    NameNode.stateChangeLog.warn(
        "DIR* NameSystem.internalReleaseLease: " +
        "File " + src + " has not been closed." +
        " Lease recovery is in progress. " +
        "RecoveryId = " + blockRecoveryId + " for block " + lastBlock);
    break;
  }
  return false;
}

/**
 * Reassign the lease on {@code src} to {@code newHolder}, logging the
 * reassignment to the edit log first (not synced here).
 *
 * @return the (possibly new) lease for the file
 */
private Lease reassignLease(Lease lease, String src, String newHolder,
    INodeFile pendingFile) {
  assert hasWriteLock();
  if(newHolder == null)
    return lease;
  // The following transaction is not synced. Make sure it's sync'ed later.
  logReassignLease(lease.getHolder(), src, newHolder);
  return reassignLeaseInternal(lease, src, newHolder, pendingFile);
}

/**
 * Apply a lease reassignment in memory: update the file's recorded client
 * name and move the lease in the lease manager.
 */
Lease reassignLeaseInternal(Lease lease, String src, String newHolder,
    INodeFile pendingFile) {
  assert hasWriteLock();
  pendingFile.getFileUnderConstructionFeature().setClientName(newHolder);
  return leaseManager.reassignLease(lease, src, newHolder);
}

/**
 * Commit (and complete, if minimally replicated) the last block of an
 * under-construction file, then reclaim any over-reserved disk space.
 */
private void commitOrCompleteLastBlock(final INodeFile fileINode,
    final INodesInPath iip, final Block commitBlock) throws IOException {
  assert hasWriteLock();
  Preconditions.checkArgument(fileINode.isUnderConstruction());
  if (!blockManager.commitOrCompleteLastBlock(fileINode, commitBlock)) {
    return;
  }

  // Adjust disk space consumption if required
  final long diff = fileINode.getPreferredBlockSize() - commitBlock.getNumBytes();
  if (diff > 0) {
    try {
      // The last block used less than a full preferred block; give the
      // difference back to the quota accounting.
      dir.updateSpaceConsumed(iip, 0, -diff, fileINode.getFileReplication());
    } catch (IOException e) {
      LOG.warn("Unexpected exception while updating disk space.", e);
    }
  }
}

/**
 * Turn an under-construction file into a complete file: drop its lease,
 * record the modification in the snapshot, convert the inode, persist the
 * close, and check the file's replication.
 */
private void finalizeINodeFileUnderConstruction(String src,
    INodeFile pendingFile, int latestSnapshot) throws IOException {
  assert hasWriteLock();

  FileUnderConstructionFeature uc = pendingFile.getFileUnderConstructionFeature();
  Preconditions.checkArgument(uc != null);
  leaseManager.removeLease(uc.getClientName(), src);

  pendingFile.recordModification(latestSnapshot);

  // The file is no longer pending.
  // Create permanent INode, update blocks. No need to replace the inode here
  // since we just remove the uc feature from pendingFile
  pendingFile.toCompleteFile(now());

  waitForLoadingFSImage();
  // close file and persist block allocations for this file
  closeFile(src, pendingFile);

  blockManager.checkReplication(pendingFile);
}

/**
 * @return the stored block for {@code block} from the block manager.
 */
@VisibleForTesting
BlockInfoContiguous getStoredBlock(Block block) {
  return blockManager.getStoredBlock(block);
}

@Override
public boolean isInSnapshot(BlockInfoContiguousUnderConstruction blockUC) {
  assert hasReadLock();
  final BlockCollection bc = blockUC.getBlockCollection();
  if (bc == null || !(bc instanceof INodeFile)
      || !bc.isUnderConstruction()) {
    return false;
  }

  String fullName = bc.getName();
  try {
    if (fullName != null && fullName.startsWith(Path.SEPARATOR)
        && dir.getINode(fullName) == bc) {
      // If file exists in normal path then no need to look in snapshot
      return false;
    }
  } catch (UnresolvedLinkException e) {
    LOG.error("Error while resolving the link : " + fullName, e);
    return false;
  }
  /*
   * 1. if bc is under construction and also with snapshot, and
   * bc is not in the current fsdirectory tree, bc must represent a snapshot
   * file.
   * 2. if fullName is not an absolute path, bc cannot be existent in the
   * current fsdirectory tree.
   * 3. if bc is not the current node associated with fullName, bc must be a
   * snapshot inode.
*/
    return true;
  }

  /**
   * Commit the result of a block recovery to the namespace: update the
   * recovered block's generation stamp and length, record the surviving
   * replica locations, and optionally close the file.
   *
   * @param oldBlock the block that was recovered (pre-recovery identity)
   * @param newgenerationstamp generation stamp assigned for this recovery;
   *          must match the recovery id of the file's last block
   * @param newlength finalized length of the recovered block
   * @param closeFile if true, commit the last block and close the file
   *          (a CloseOp reaches the edit log via closeFileCommitBlocks)
   * @param deleteblock if true, remove the last block from the file and the
   *          blocks map instead of updating it
   * @param newtargets datanodes that hold the recovered replicas
   * @param newtargetstorages storage ids, parallel to {@code newtargets}
   * @throws IOException if the block is unknown, the recovery id does not
   *           match, or the owning file has been deleted
   */
  void commitBlockSynchronization(ExtendedBlock oldBlock,
      long newgenerationstamp, long newlength,
      boolean closeFile, boolean deleteblock, DatanodeID[] newtargets,
      String[] newtargetstorages) throws IOException {
    LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock
        + ", newgenerationstamp=" + newgenerationstamp
        + ", newlength=" + newlength
        + ", newtargets=" + Arrays.asList(newtargets)
        + ", closeFile=" + closeFile
        + ", deleteBlock=" + deleteblock
        + ")");
    checkOperation(OperationCategory.WRITE);
    String src = "";
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      // If a DN tries to commit to the standby, the recovery will
      // fail, and the next retry will succeed on the new NN.

      checkNameNodeSafeMode(
          "Cannot commitBlockSynchronization while in safe mode");
      final BlockInfoContiguous storedBlock = getStoredBlock(
          ExtendedBlock.getLocalBlock(oldBlock));
      if (storedBlock == null) {
        if (deleteblock) {
          // This may be a retry attempt so ignore the failure
          // to locate the block.
          if (LOG.isDebugEnabled()) {
            LOG.debug("Block (=" + oldBlock + ") not found");
          }
          return;
        } else {
          throw new IOException("Block (=" + oldBlock + ") not found");
        }
      }
      // Snapshot the pre-recovery identity; needed later to mark stale
      // replicas corrupt after the block is updated in place.
      final long oldGenerationStamp = storedBlock.getGenerationStamp();
      final long oldNumBytes = storedBlock.getNumBytes();
      //
      // The implementation of delete operation (see @deleteInternal method)
      // first removes the file paths from namespace, and delays the removal
      // of blocks to later time for better performance. When
      // commitBlockSynchronization (this method) is called in between, the
      // blockCollection of storedBlock could have been assigned to null by
      // the delete operation, throw IOException here instead of NPE; if the
      // file path is already removed from namespace by the delete operation,
      // throw FileNotFoundException here, so not to proceed to the end of
      // this method to add a CloseOp to the edit log for an already deleted
      // file (See HDFS-6825).
      //
      BlockCollection blockCollection = storedBlock.getBlockCollection();
      if (blockCollection == null) {
        throw new IOException("The blockCollection of " + storedBlock
            + " is null, likely because the file owning this block was"
            + " deleted and the block removal is delayed");
      }
      INodeFile iFile = ((INode)blockCollection).asFile();
      if (isFileDeleted(iFile)) {
        throw new FileNotFoundException("File not found: "
            + iFile.getFullPathName() + ", likely due to delayed block"
            + " removal");
      }
      // A completed file (or completed last block) means this is a stale or
      // duplicate commit; silently ignore it.
      if ((!iFile.isUnderConstruction() || storedBlock.isComplete()) &&
          iFile.getLastBlock().isComplete()) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Unexpected block (=" + oldBlock
              + ") since the file (=" + iFile.getLocalName()
              + ") is not under construction");
        }
        return;
      }

      BlockInfoContiguousUnderConstruction truncatedBlock =
          (BlockInfoContiguousUnderConstruction) iFile.getLastBlock();
      long recoveryId = truncatedBlock.getBlockRecoveryId();
      // copyTruncate: the file's last block has a different id than the
      // recovered block — presumably the truncate-with-copy case where a new
      // block replaced the original (NOTE(review): confirm against truncate
      // handling).
      boolean copyTruncate =
          truncatedBlock.getBlockId() != storedBlock.getBlockId();
      if(recoveryId != newgenerationstamp) {
        throw new IOException("The recovery id " + newgenerationstamp
            + " does not match current recovery id "
            + recoveryId + " for block " + oldBlock);
      }

      if (deleteblock) {
        // Recovery decided the last block should be dropped entirely.
        Block blockToDel = ExtendedBlock.getLocalBlock(oldBlock);
        boolean remove = iFile.removeLastBlock(blockToDel);
        if (remove) {
          blockManager.removeBlock(storedBlock);
        }
      }
      else {
        // update last block
        if(!copyTruncate) {
          storedBlock.setGenerationStamp(newgenerationstamp);
          storedBlock.setNumBytes(newlength);
        }

        // find the DatanodeDescriptor objects
        ArrayList<DatanodeDescriptor> trimmedTargets =
            new ArrayList<DatanodeDescriptor>(newtargets.length);
        ArrayList<String> trimmedStorages =
            new ArrayList<String>(newtargets.length);
        if (newtargets.length > 0) {
          for (int i = 0; i < newtargets.length; ++i) {
            // try to get targetNode; targets unknown to the DatanodeManager
            // are dropped ("trimmed") together with their storage ids.
            DatanodeDescriptor targetNode =
                blockManager.getDatanodeManager().getDatanode(newtargets[i]);
            if (targetNode != null) {
              trimmedTargets.add(targetNode);
              trimmedStorages.add(newtargetstorages[i]);
            } else if (LOG.isDebugEnabled()) {
              LOG.debug("DatanodeDescriptor (=" + newtargets[i] + ") not found");
            }
          }
        }
        if ((closeFile) && !trimmedTargets.isEmpty()) {
          // the file is getting closed. Insert block locations into blockManager.
          // Otherwise fsck will report these blocks as MISSING, especially if the
          // blocksReceived from Datanodes take a long time to arrive.
          for (int i = 0; i < trimmedTargets.size(); i++) {
            DatanodeStorageInfo storageInfo =
                trimmedTargets.get(i).getStorageInfo(trimmedStorages.get(i));
            if (storageInfo != null) {
              if(copyTruncate) {
                storageInfo.addBlock(truncatedBlock);
              } else {
                storageInfo.addBlock(storedBlock);
              }
            }
          }
        }

        // add pipeline locations into the INodeUnderConstruction
        DatanodeStorageInfo[] trimmedStorageInfos =
            blockManager.getDatanodeManager().getDatanodeStorageInfos(
                trimmedTargets.toArray(new DatanodeID[trimmedTargets.size()]),
                trimmedStorages.toArray(new String[trimmedStorages.size()]));
        if(copyTruncate) {
          iFile.setLastBlock(truncatedBlock, trimmedStorageInfos);
        } else {
          iFile.setLastBlock(storedBlock, trimmedStorageInfos);
          if (closeFile) {
            // Replicas still carrying the old genstamp/length are stale now.
            blockManager.markBlockReplicasAsCorrupt(storedBlock,
                oldGenerationStamp, oldNumBytes, trimmedStorageInfos);
          }
        }
      }

      if (closeFile) {
        if(copyTruncate) {
          src = closeFileCommitBlocks(iFile, truncatedBlock);
          if(!iFile.isBlockInLatestSnapshot(storedBlock)) {
            blockManager.removeBlock(storedBlock);
          }
        } else {
          src = closeFileCommitBlocks(iFile, storedBlock);
        }
      } else {
        // If this commit does not want to close the file, persist blocks
        src = iFile.getFullPathName();
        persistBlocks(src, iFile, false);
      }
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the write lock.
    getEditLog().logSync();
    if (closeFile) {
      LOG.info("commitBlockSynchronization(oldBlock=" + oldBlock
          + ", file=" + src
          + ", newgenerationstamp=" + newgenerationstamp
          + ", newlength=" + newlength
          + ", newtargets=" + Arrays.asList(newtargets) + ") successful");
    } else {
      LOG.info("commitBlockSynchronization(" + oldBlock + ") successful");
    }
  }

  /**
   * @param pendingFile open file that needs to be closed
   * @param storedBlock last block
   * @return 
Path of the file that was closed.
   * @throws IOException on error
   */
  @VisibleForTesting
  String closeFileCommitBlocks(INodeFile pendingFile, BlockInfoContiguous storedBlock)
      throws IOException {
    final INodesInPath iip = INodesInPath.fromINode(pendingFile);
    final String src = iip.getPath();

    // commit the last block and complete it if it has minimum replicas
    commitOrCompleteLastBlock(pendingFile, iip, storedBlock);

    //remove lease, close file
    finalizeINodeFileUnderConstruction(src, pendingFile,
        Snapshot.findLatestSnapshot(pendingFile, Snapshot.CURRENT_STATE_ID));

    return src;
  }

  /**
   * Renew the lease(s) held by the given client.
   * Note: classified as a WRITE operation (so the standby rejects it) but
   * only the namesystem read lock is taken around the LeaseManager update.
   */
  void renewLease(String holder) throws IOException {
    checkOperation(OperationCategory.WRITE);
    readLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot renew lease for " + holder);
      leaseManager.renewLease(holder);
    } finally {
      readUnlock();
    }
  }

  /**
   * Get a partial listing of the indicated directory
   *
   * @param src the directory name
   * @param startAfter the name to start after
   * @param needLocation if blockLocations need to be returned
   * @return a partial listing starting after startAfter
   *
   * @throws AccessControlException if access is denied
   * @throws UnresolvedLinkException if symbolic link is encountered
   * @throws IOException if other I/O error occurred
   */
  DirectoryListing getListing(String src, byte[] startAfter,
      boolean needLocation)
      throws IOException {
    checkOperation(OperationCategory.READ);
    DirectoryListing dl = null;
    readLock();
    try {
      checkOperation(NameNode.OperationCategory.READ);
      dl = FSDirStatAndListingOp.getListingInt(dir, src, startAfter,
          needLocation);
    } catch (AccessControlException e) {
      // Audit the denied attempt before propagating.
      logAuditEvent(false, "listStatus", src);
      throw e;
    } finally {
      readUnlock();
    }
    logAuditEvent(true, "listStatus", src);
    return dl;
  }

  /////////////////////////////////////////////////////////
  //
  // These methods are called by datanodes
  //
  /////////////////////////////////////////////////////////
  /**
   * Register Datanode.
   * <p>
   * The purpose of registration is to identify whether the new datanode
   * serves a new data storage, and will report new data block copies,
   * which the namenode was not aware of; or the datanode is a replacement
   * node for the data storage that was previously served by a different
   * or the same (in terms of host:port) datanode.
   * The data storages are distinguished by their storageIDs. When a new
   * data storage is reported the namenode issues a new unique storageID.
   * <p>
   * Finally, the namenode returns its namespaceID as the registrationID
   * for the datanodes.
   * namespaceID is a persistent attribute of the name space.
   * The registrationID is checked every time the datanode is communicating
   * with the namenode.
   * Datanodes with inappropriate registrationID are rejected.
   * If the namenode stops, and then restarts it can restore its
   * namespaceID and will continue serving the datanodes that has previously
   * registered with the namenode without restarting the whole cluster.
   *
   * @see org.apache.hadoop.hdfs.server.datanode.DataNode
   */
  void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
    writeLock();
    try {
      getBlockManager().getDatanodeManager().registerDatanode(nodeReg);
      // Registration may change the live-node count, which feeds safe mode.
      checkSafeMode();
    } finally {
      writeUnlock();
    }
  }

  /**
   * Get registrationID for datanodes based on the namespaceID.
*
   * @see #registerDatanode(DatanodeRegistration)
   * @return registration ID
   */
  String getRegistrationID() {
    return Storage.getRegistrationID(getFSImage().getStorage());
  }

  /**
   * The given node has reported in. This method should:
   * 1) Record the heartbeat, so the datanode isn't timed out
   * 2) Adjust usage stats for future block allocation
   *
   * If a substantial amount of time passed since the last datanode
   * heartbeat then request an immediate block report.
   *
   * @return an array of datanode commands
   * @throws IOException
   */
  HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
      StorageReport[] reports, long cacheCapacity, long cacheUsed,
      int xceiverCount, int xmitsInProgress, int failedVolumes,
      VolumeFailureSummary volumeFailureSummary) throws IOException {
    readLock();
    try {
      //get datanode commands
      // Cap replication work handed to this DN by what it is already doing.
      final int maxTransfer = blockManager.getMaxReplicationStreams()
          - xmitsInProgress;
      DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
          nodeReg, reports, blockPoolId, cacheCapacity, cacheUsed,
          xceiverCount, maxTransfer, failedVolumes, volumeFailureSummary);

      //create ha status
      // Read under the read lock so HA state and txid are mutually consistent.
      final NNHAStatusHeartbeat haState = new NNHAStatusHeartbeat(
          haContext.getState().getServiceState(),
          getFSImage().getLastAppliedOrWrittenTxId());

      return new HeartbeatResponse(cmds, haState, rollingUpgradeInfo);
    } finally {
      readUnlock();
    }
  }

  /**
   * Returns whether or not there were available resources at the last check of
   * resources.
   *
   * @return true if there were sufficient resources available, false otherwise.
   */
  boolean nameNodeHasResourcesAvailable() {
    return hasResourcesAvailable;
  }

  /**
   * Perform resource checks and cache the results.
4593 */ 4594 void checkAvailableResources() { 4595 Preconditions.checkState(nnResourceChecker != null, 4596 "nnResourceChecker not initialized"); 4597 hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace(); 4598 } 4599 4600 /** 4601 * Persist the block list for the inode. 4602 * @param path 4603 * @param file 4604 * @param logRetryCache 4605 */ 4606 private void persistBlocks(String path, INodeFile file, 4607 boolean logRetryCache) { 4608 assert hasWriteLock(); 4609 Preconditions.checkArgument(file.isUnderConstruction()); 4610 getEditLog().logUpdateBlocks(path, file, logRetryCache); 4611 NameNode.stateChangeLog.debug("persistBlocks: {} with {} blocks is" + 4612 " peristed to the file system", path, file.getBlocks().length); 4613 } 4614 4615 /** 4616 * Close file. 4617 * @param path 4618 * @param file 4619 */ 4620 private void closeFile(String path, INodeFile file) { 4621 assert hasWriteLock(); 4622 waitForLoadingFSImage(); 4623 // file is closed 4624 getEditLog().logCloseFile(path, file); 4625 NameNode.stateChangeLog.debug("closeFile: {} with {} blocks is persisted" + 4626 " to the file system", path, file.getBlocks().length); 4627 } 4628 4629 /** 4630 * Periodically calls hasAvailableResources of NameNodeResourceChecker, and if 4631 * there are found to be insufficient resources available, causes the NN to 4632 * enter safe mode. If resources are later found to have returned to 4633 * acceptable levels, this daemon will cause the NN to exit safe mode. 4634 */ 4635 class NameNodeResourceMonitor implements Runnable { 4636 boolean shouldNNRmRun = true; 4637 @Override 4638 public void run () { 4639 try { 4640 while (fsRunning && shouldNNRmRun) { 4641 checkAvailableResources(); 4642 if(!nameNodeHasResourcesAvailable()) { 4643 String lowResourcesMsg = "NameNode low on available disk space. 
"; 4644 if (!isInSafeMode()) { 4645 LOG.warn(lowResourcesMsg + "Entering safe mode."); 4646 } else { 4647 LOG.warn(lowResourcesMsg + "Already in safe mode."); 4648 } 4649 enterSafeMode(true); 4650 } 4651 try { 4652 Thread.sleep(resourceRecheckInterval); 4653 } catch (InterruptedException ie) { 4654 // Deliberately ignore 4655 } 4656 } 4657 } catch (Exception e) { 4658 FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e); 4659 } 4660 } 4661 4662 public void stopMonitor() { 4663 shouldNNRmRun = false; 4664 } 4665 } 4666 4667 class NameNodeEditLogRoller implements Runnable { 4668 4669 private boolean shouldRun = true; 4670 private final long rollThreshold; 4671 private final long sleepIntervalMs; 4672 4673 public NameNodeEditLogRoller(long rollThreshold, int sleepIntervalMs) { 4674 this.rollThreshold = rollThreshold; 4675 this.sleepIntervalMs = sleepIntervalMs; 4676 } 4677 4678 @Override 4679 public void run() { 4680 while (fsRunning && shouldRun) { 4681 try { 4682 FSEditLog editLog = getFSImage().getEditLog(); 4683 long numEdits = 4684 editLog.getLastWrittenTxId() - editLog.getCurSegmentTxId(); 4685 if (numEdits > rollThreshold) { 4686 FSNamesystem.LOG.info("NameNode rolling its own edit log because" 4687 + " number of edits in open segment exceeds threshold of " 4688 + rollThreshold); 4689 rollEditLog(); 4690 } 4691 } catch (Exception e) { 4692 FSNamesystem.LOG.error("Swallowing exception in " 4693 + NameNodeEditLogRoller.class.getSimpleName() + ":", e); 4694 } 4695 try { 4696 Thread.sleep(sleepIntervalMs); 4697 } catch (InterruptedException e) { 4698 FSNamesystem.LOG.info(NameNodeEditLogRoller.class.getSimpleName() 4699 + " was interrupted, exiting"); 4700 break; 4701 } 4702 } 4703 } 4704 4705 public void stop() { 4706 shouldRun = false; 4707 } 4708 } 4709 4710 /** 4711 * Daemon to periodically scan the namespace for lazyPersist files 4712 * with missing blocks and unlink them. 
4713 */ 4714 class LazyPersistFileScrubber implements Runnable { 4715 private volatile boolean shouldRun = true; 4716 final int scrubIntervalSec; 4717 public LazyPersistFileScrubber(final int scrubIntervalSec) { 4718 this.scrubIntervalSec = scrubIntervalSec; 4719 } 4720 4721 /** 4722 * Periodically go over the list of lazyPersist files with missing 4723 * blocks and unlink them from the namespace. 4724 */ 4725 private void clearCorruptLazyPersistFiles() 4726 throws IOException { 4727 4728 BlockStoragePolicy lpPolicy = blockManager.getStoragePolicy("LAZY_PERSIST"); 4729 4730 List<BlockCollection> filesToDelete = new ArrayList<>(); 4731 boolean changed = false; 4732 writeLock(); 4733 try { 4734 final Iterator<Block> it = blockManager.getCorruptReplicaBlockIterator(); 4735 4736 while (it.hasNext()) { 4737 Block b = it.next(); 4738 BlockInfoContiguous blockInfo = blockManager.getStoredBlock(b); 4739 if (blockInfo.getBlockCollection().getStoragePolicyID() 4740 == lpPolicy.getId()) { 4741 filesToDelete.add(blockInfo.getBlockCollection()); 4742 } 4743 } 4744 4745 for (BlockCollection bc : filesToDelete) { 4746 LOG.warn("Removing lazyPersist file " + bc.getName() + " with no replicas."); 4747 BlocksMapUpdateInfo toRemoveBlocks = 4748 FSDirDeleteOp.deleteInternal( 4749 FSNamesystem.this, bc.getName(), 4750 INodesInPath.fromINode((INodeFile) bc), false); 4751 changed |= toRemoveBlocks != null; 4752 if (toRemoveBlocks != null) { 4753 removeBlocks(toRemoveBlocks); // Incremental deletion of blocks 4754 } 4755 } 4756 } finally { 4757 writeUnlock(); 4758 } 4759 if (changed) { 4760 getEditLog().logSync(); 4761 } 4762 } 4763 4764 @Override 4765 public void run() { 4766 while (fsRunning && shouldRun) { 4767 try { 4768 clearCorruptLazyPersistFiles(); 4769 Thread.sleep(scrubIntervalSec * 1000); 4770 } catch (InterruptedException e) { 4771 FSNamesystem.LOG.info( 4772 "LazyPersistFileScrubber was interrupted, exiting"); 4773 break; 4774 } catch (Exception e) { 4775 
FSNamesystem.LOG.error( 4776 "Ignoring exception in LazyPersistFileScrubber:", e); 4777 } 4778 } 4779 } 4780 4781 public void stop() { 4782 shouldRun = false; 4783 } 4784 } 4785 4786 public FSImage getFSImage() { 4787 return fsImage; 4788 } 4789 4790 public FSEditLog getEditLog() { 4791 return getFSImage().getEditLog(); 4792 } 4793 4794 private void checkBlock(ExtendedBlock block) throws IOException { 4795 if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) { 4796 throw new IOException("Unexpected BlockPoolId " + block.getBlockPoolId() 4797 + " - expected " + blockPoolId); 4798 } 4799 } 4800 4801 @Metric({"MissingBlocks", "Number of missing blocks"}) 4802 public long getMissingBlocksCount() { 4803 // not locking 4804 return blockManager.getMissingBlocksCount(); 4805 } 4806 4807 @Metric({"MissingReplOneBlocks", "Number of missing blocks " + 4808 "with replication factor 1"}) 4809 public long getMissingReplOneBlocksCount() { 4810 // not locking 4811 return blockManager.getMissingReplOneBlocksCount(); 4812 } 4813 4814 @Metric({"ExpiredHeartbeats", "Number of expired heartbeats"}) 4815 public int getExpiredHeartbeats() { 4816 return datanodeStatistics.getExpiredHeartbeats(); 4817 } 4818 4819 @Metric({"TransactionsSinceLastCheckpoint", 4820 "Number of transactions since last checkpoint"}) 4821 public long getTransactionsSinceLastCheckpoint() { 4822 return getEditLog().getLastWrittenTxId() - 4823 getFSImage().getStorage().getMostRecentCheckpointTxId(); 4824 } 4825 4826 @Metric({"TransactionsSinceLastLogRoll", 4827 "Number of transactions since last edit log roll"}) 4828 public long getTransactionsSinceLastLogRoll() { 4829 if (isInStandbyState() || !getEditLog().isSegmentOpen()) { 4830 return 0; 4831 } else { 4832 return getEditLog().getLastWrittenTxId() - 4833 getEditLog().getCurSegmentTxId() + 1; 4834 } 4835 } 4836 4837 @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"}) 4838 public long getLastWrittenTransactionId() { 4839 
return getEditLog().getLastWrittenTxId(); 4840 } 4841 4842 @Metric({"LastCheckpointTime", 4843 "Time in milliseconds since the epoch of the last checkpoint"}) 4844 public long getLastCheckpointTime() { 4845 return getFSImage().getStorage().getMostRecentCheckpointTime(); 4846 } 4847 4848 /** @see ClientProtocol#getStats() */ 4849 long[] getStats() { 4850 final long[] stats = datanodeStatistics.getStats(); 4851 stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks(); 4852 stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks(); 4853 stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount(); 4854 stats[ClientProtocol.GET_STATS_MISSING_REPL_ONE_BLOCKS_IDX] = 4855 getMissingReplOneBlocksCount(); 4856 return stats; 4857 } 4858 4859 @Override // FSNamesystemMBean 4860 @Metric({"CapacityTotal", 4861 "Total raw capacity of data nodes in bytes"}) 4862 public long getCapacityTotal() { 4863 return datanodeStatistics.getCapacityTotal(); 4864 } 4865 4866 @Metric({"CapacityTotalGB", 4867 "Total raw capacity of data nodes in GB"}) 4868 public float getCapacityTotalGB() { 4869 return DFSUtil.roundBytesToGB(getCapacityTotal()); 4870 } 4871 4872 @Override // FSNamesystemMBean 4873 @Metric({"CapacityUsed", 4874 "Total used capacity across all data nodes in bytes"}) 4875 public long getCapacityUsed() { 4876 return datanodeStatistics.getCapacityUsed(); 4877 } 4878 4879 @Metric({"CapacityUsedGB", 4880 "Total used capacity across all data nodes in GB"}) 4881 public float getCapacityUsedGB() { 4882 return DFSUtil.roundBytesToGB(getCapacityUsed()); 4883 } 4884 4885 @Override // FSNamesystemMBean 4886 @Metric({"CapacityRemaining", "Remaining capacity in bytes"}) 4887 public long getCapacityRemaining() { 4888 return datanodeStatistics.getCapacityRemaining(); 4889 } 4890 4891 @Metric({"CapacityRemainingGB", "Remaining capacity in GB"}) 4892 public float getCapacityRemainingGB() { 4893 return 
DFSUtil.roundBytesToGB(getCapacityRemaining()); 4894 } 4895 4896 @Metric({"CapacityUsedNonDFS", 4897 "Total space used by data nodes for non DFS purposes in bytes"}) 4898 public long getCapacityUsedNonDFS() { 4899 return datanodeStatistics.getCapacityUsedNonDFS(); 4900 } 4901 4902 /** 4903 * Total number of connections. 4904 */ 4905 @Override // FSNamesystemMBean 4906 @Metric 4907 public int getTotalLoad() { 4908 return datanodeStatistics.getXceiverCount(); 4909 } 4910 4911 @Metric({ "SnapshottableDirectories", "Number of snapshottable directories" }) 4912 public int getNumSnapshottableDirs() { 4913 return this.snapshotManager.getNumSnapshottableDirs(); 4914 } 4915 4916 @Metric({ "Snapshots", "The number of snapshots" }) 4917 public int getNumSnapshots() { 4918 return this.snapshotManager.getNumSnapshots(); 4919 } 4920 4921 @Override 4922 public String getSnapshotStats() { 4923 Map<String, Object> info = new HashMap<String, Object>(); 4924 info.put("SnapshottableDirectories", this.getNumSnapshottableDirs()); 4925 info.put("Snapshots", this.getNumSnapshots()); 4926 return JSON.toString(info); 4927 } 4928 4929 int getNumberOfDatanodes(DatanodeReportType type) { 4930 readLock(); 4931 try { 4932 return getBlockManager().getDatanodeManager().getDatanodeListForReport( 4933 type).size(); 4934 } finally { 4935 readUnlock(); 4936 } 4937 } 4938 4939 DatanodeInfo[] datanodeReport(final DatanodeReportType type 4940 ) throws AccessControlException, StandbyException { 4941 checkSuperuserPrivilege(); 4942 checkOperation(OperationCategory.UNCHECKED); 4943 readLock(); 4944 try { 4945 checkOperation(OperationCategory.UNCHECKED); 4946 final DatanodeManager dm = getBlockManager().getDatanodeManager(); 4947 final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type); 4948 4949 DatanodeInfo[] arr = new DatanodeInfo[results.size()]; 4950 for (int i=0; i<arr.length; i++) { 4951 arr[i] = new DatanodeInfo(results.get(i)); 4952 } 4953 return arr; 4954 } finally { 4955 
readUnlock(); 4956 } 4957 } 4958 4959 DatanodeStorageReport[] getDatanodeStorageReport(final DatanodeReportType type 4960 ) throws AccessControlException, StandbyException { 4961 checkSuperuserPrivilege(); 4962 checkOperation(OperationCategory.UNCHECKED); 4963 readLock(); 4964 try { 4965 checkOperation(OperationCategory.UNCHECKED); 4966 final DatanodeManager dm = getBlockManager().getDatanodeManager(); 4967 final List<DatanodeDescriptor> datanodes = dm.getDatanodeListForReport(type); 4968 4969 DatanodeStorageReport[] reports = new DatanodeStorageReport[datanodes.size()]; 4970 for (int i = 0; i < reports.length; i++) { 4971 final DatanodeDescriptor d = datanodes.get(i); 4972 reports[i] = new DatanodeStorageReport(new DatanodeInfo(d), 4973 d.getStorageReports()); 4974 } 4975 return reports; 4976 } finally { 4977 readUnlock(); 4978 } 4979 } 4980 4981 /** 4982 * Save namespace image. 4983 * This will save current namespace into fsimage file and empty edits file. 4984 * Requires superuser privilege and safe mode. 4985 * 4986 * @throws AccessControlException if superuser privilege is violated. 4987 * @throws IOException if 4988 */ 4989 void saveNamespace() throws AccessControlException, IOException { 4990 checkOperation(OperationCategory.UNCHECKED); 4991 checkSuperuserPrivilege(); 4992 4993 cpLock(); // Block if a checkpointing is in progress on standby. 4994 readLock(); 4995 try { 4996 checkOperation(OperationCategory.UNCHECKED); 4997 4998 if (!isInSafeMode()) { 4999 throw new IOException("Safe mode should be turned ON " 5000 + "in order to create namespace image."); 5001 } 5002 getFSImage().saveNamespace(this); 5003 } finally { 5004 readUnlock(); 5005 cpUnlock(); 5006 } 5007 LOG.info("New namespace image has been created"); 5008 } 5009 5010 /** 5011 * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again. 5012 * Requires superuser privilege. 5013 * 5014 * @throws AccessControlException if superuser privilege is violated. 
5015 */ 5016 boolean restoreFailedStorage(String arg) throws AccessControlException, 5017 StandbyException { 5018 checkSuperuserPrivilege(); 5019 checkOperation(OperationCategory.UNCHECKED); 5020 cpLock(); // Block if a checkpointing is in progress on standby. 5021 writeLock(); 5022 try { 5023 checkOperation(OperationCategory.UNCHECKED); 5024 5025 // if it is disabled - enable it and vice versa. 5026 if(arg.equals("check")) 5027 return getFSImage().getStorage().getRestoreFailedStorage(); 5028 5029 boolean val = arg.equals("true"); // false if not 5030 getFSImage().getStorage().setRestoreFailedStorage(val); 5031 5032 return val; 5033 } finally { 5034 writeUnlock(); 5035 cpUnlock(); 5036 } 5037 } 5038 5039 Date getStartTime() { 5040 return new Date(startTime); 5041 } 5042 5043 void finalizeUpgrade() throws IOException { 5044 checkSuperuserPrivilege(); 5045 checkOperation(OperationCategory.UNCHECKED); 5046 cpLock(); // Block if a checkpointing is in progress on standby. 5047 writeLock(); 5048 try { 5049 checkOperation(OperationCategory.UNCHECKED); 5050 getFSImage().finalizeUpgrade(this.isHaEnabled() && inActiveState()); 5051 } finally { 5052 writeUnlock(); 5053 cpUnlock(); 5054 } 5055 } 5056 5057 void refreshNodes() throws IOException { 5058 checkOperation(OperationCategory.UNCHECKED); 5059 checkSuperuserPrivilege(); 5060 getBlockManager().getDatanodeManager().refreshNodes(new HdfsConfiguration()); 5061 } 5062 5063 void setBalancerBandwidth(long bandwidth) throws IOException { 5064 checkOperation(OperationCategory.UNCHECKED); 5065 checkSuperuserPrivilege(); 5066 getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth); 5067 } 5068 5069 /** 5070 * Persist the new block (the last block of the given file). 
* @param path full path of the file
   * @param file the under-construction file that gained a block
   */
  private void persistNewBlock(String path, INodeFile file) {
    // Logs an AddBlock op for the file's newly allocated last block.
    // NOTE(review): unlike persistBlocks, no hasWriteLock() assert here —
    // callers presumably hold the write lock; confirm.
    Preconditions.checkArgument(file.isUnderConstruction());
    getEditLog().logAddBlock(path, file);
    NameNode.stateChangeLog.debug("persistNewBlock: {} with new block {}," +
        " current total block count is {}", path,
        file.getLastBlock().toString(), file.getBlocks().length);
  }

  /**
   * SafeModeInfo contains information related to the safe mode.
   * <p>
   * An instance of {@link SafeModeInfo} is created when the name node
   * enters safe mode.
   * <p>
   * During name node startup {@link SafeModeInfo} counts the number of
   * <em>safe blocks</em>, those that have at least the minimal number of
   * replicas, and calculates the ratio of safe blocks to the total number
   * of blocks in the system, which is the size of blocks in
   * {@link FSNamesystem#blockManager}. When the ratio reaches the
   * {@link #threshold} it starts the SafeModeMonitor daemon in order
   * to monitor whether the safe mode {@link #extension} is passed.
   * Then it leaves safe mode and destroys itself.
   * <p>
   * If safe mode is turned on manually then the number of safe blocks is
   * not tracked because the name node is not intended to leave safe mode
   * automatically in the case.
   *
   * @see ClientProtocol#setSafeMode(HdfsConstants.SafeModeAction, boolean)
   */
  public class SafeModeInfo {
    // configuration fields
    /** Safe mode threshold condition %.*/
    private final double threshold;
    /** Safe mode minimum number of datanodes alive */
    private final int datanodeThreshold;
    /**
     * Safe mode extension after the threshold.
     * Make it volatile so that getSafeModeTip can read the latest value
     * without taking a lock.
     */
    private volatile int extension;
    /** Min replication required by safe mode.
*/ 5116 private final int safeReplication; 5117 /** threshold for populating needed replication queues */ 5118 private final double replQueueThreshold; 5119 // internal fields 5120 /** Time when threshold was reached. 5121 * <br> -1 safe mode is off 5122 * <br> 0 safe mode is on, and threshold is not reached yet 5123 * <br> >0 safe mode is on, but we are in extension period 5124 */ 5125 private long reached = -1; 5126 private long reachedTimestamp = -1; 5127 /** Total number of blocks. */ 5128 int blockTotal; 5129 /** Number of safe blocks. */ 5130 int blockSafe; 5131 /** Number of blocks needed to satisfy safe mode threshold condition */ 5132 private int blockThreshold; 5133 /** Number of blocks needed before populating replication queues */ 5134 private int blockReplQueueThreshold; 5135 /** time of the last status printout */ 5136 private long lastStatusReport = 0; 5137 /** 5138 * Was safemode entered automatically because available resources were low. 5139 * Make it volatile so that getSafeModeTip can read the latest value 5140 * without taking a lock. 5141 */ 5142 private volatile boolean resourcesLow = false; 5143 /** Should safemode adjust its block totals as blocks come in */ 5144 private boolean shouldIncrementallyTrackBlocks = false; 5145 /** counter for tracking startup progress of reported blocks */ 5146 private Counter awaitingReportedBlocksCounter; 5147 5148 /** 5149 * Creates SafeModeInfo when the name node enters 5150 * automatic safe mode at startup. 
5151 * 5152 * @param conf configuration 5153 */ 5154 private SafeModeInfo(Configuration conf) { 5155 this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY, 5156 DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT); 5157 if(threshold > 1.0) { 5158 LOG.warn("The threshold value should't be greater than 1, threshold: " + threshold); 5159 } 5160 this.datanodeThreshold = conf.getInt( 5161 DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY, 5162 DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT); 5163 this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0); 5164 this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY, 5165 DFS_NAMENODE_REPLICATION_MIN_DEFAULT); 5166 5167 LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold); 5168 LOG.info(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold); 5169 LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + " = " + extension); 5170 5171 // default to safe mode threshold (i.e., don't populate queues before leaving safe mode) 5172 this.replQueueThreshold = 5173 conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY, 5174 (float) threshold); 5175 this.blockTotal = 0; 5176 this.blockSafe = 0; 5177 } 5178 5179 /** 5180 * In the HA case, the StandbyNode can be in safemode while the namespace 5181 * is modified by the edit log tailer. In this case, the number of total 5182 * blocks changes as edits are processed (eg blocks are added and deleted). 5183 * However, we don't want to do the incremental tracking during the 5184 * startup-time loading process -- only once the initial total has been 5185 * set after the image has been loaded. 5186 */ 5187 private boolean shouldIncrementallyTrackBlocks() { 5188 return shouldIncrementallyTrackBlocks; 5189 } 5190 5191 /** 5192 * Creates SafeModeInfo when safe mode is entered manually, or because 5193 * available resources are low. 5194 * 5195 * The {@link #threshold} is set to 1.5 so that it could never be reached. 
     * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
     *
     * @see SafeModeInfo
     */
    private SafeModeInfo(boolean resourcesLow) {
      this.threshold = 1.5f; // this threshold can never be reached
      this.datanodeThreshold = Integer.MAX_VALUE;
      this.extension = Integer.MAX_VALUE;
      this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
      this.replQueueThreshold = 1.5f; // can never be reached
      this.blockTotal = -1;
      this.blockSafe = -1;
      this.resourcesLow = resourcesLow;
      enter();
      reportStatus("STATE* Safe mode is ON.", true);
    }

    /**
     * Check if safe mode is on.
     * @return true if in safe mode
     */
    private synchronized boolean isOn() {
      doConsistencyCheck();
      return this.reached >= 0;
    }

    /**
     * Enter safe mode.
     */
    private void enter() {
      // reached == 0 encodes "in safe mode, threshold not yet reached".
      this.reached = 0;
      this.reachedTimestamp = 0;
    }

    /**
     * Leave safe mode.
     * <p>
     * Check for invalid, under- & over-replicated blocks in the end of startup.
     */
    private synchronized void leave() {
      // if not done yet, initialize replication queues.
      // In the standby, do not populate repl queues
      if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) {
        initializeReplQueues();
      }
      long timeInSafemode = now() - startTime;
      NameNode.stateChangeLog.info("STATE* Leaving safe mode after "
          + timeInSafemode/1000 + " secs");
      NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);

      //Log the following only once (when transitioning from ON -> OFF)
      if (reached >= 0) {
        NameNode.stateChangeLog.info("STATE* Safe mode is OFF");
      }
      // reached == -1 marks safe mode as off; clear the volatile handle so
      // other threads observe the exit.
      reached = -1;
      reachedTimestamp = -1;
      safeMode = null;
      final NetworkTopology nt =
          blockManager.getDatanodeManager().getNetworkTopology();
      NameNode.stateChangeLog.info("STATE* Network topology has "
          + nt.getNumOfRacks() + " racks and "
          + nt.getNumOfLeaves() + " datanodes");
      NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
          + blockManager.numOfUnderReplicatedBlocks() + " blocks");

      startSecretManagerIfNecessary();

      // If startup has not yet completed, end safemode phase.
      StartupProgress prog = NameNode.getStartupProgress();
      if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
        prog.endStep(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS);
        prog.endPhase(Phase.SAFEMODE);
      }
    }

    /**
     * Check whether we have reached the threshold for
     * initializing replication queues.
     */
    private synchronized boolean canInitializeReplQueues() {
      return shouldPopulateReplQueues()
          && blockSafe >= blockReplQueueThreshold;
    }

    /**
     * Safe mode can be turned off iff
     * the threshold is reached and
     * the extension time has passed.
     * @return true if can leave or false otherwise.
     */
    private synchronized boolean canLeave() {
      // reached == 0 means the block/datanode thresholds were never met.
      if (reached == 0) {
        return false;
      }

      if (monotonicNow() - reached < extension) {
        reportStatus("STATE* Safe mode ON, in safe mode extension.", false);
        return false;
      }

      if (needEnter()) {
        reportStatus("STATE* Safe mode ON, thresholds not met.", false);
        return false;
      }

      return true;
    }

    /**
     * There is no need to enter safe mode
     * if DFS is empty or {@link #threshold} == 0
     */
    private boolean needEnter() {
      return (threshold != 0 && blockSafe < blockThreshold) ||
        (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
        (!nameNodeHasResourcesAvailable());
    }

    /**
     * Check and trigger safe mode if needed.
     */
    private void checkMode() {
      // Have to have write-lock since leaving safemode initializes
      // repl queues, which requires write lock
      assert hasWriteLock();
      if (inTransitionToActive()) {
        return;
      }
      // if smmthread is already running, the block threshold must have been
      // reached before, there is no need to enter the safe mode again
      if (smmthread == null && needEnter()) {
        enter();
        // check if we are ready to initialize replication queues
        if (canInitializeReplQueues() && !isPopulatingReplQueues()
            && !haEnabled) {
          initializeReplQueues();
        }
        reportStatus("STATE* Safe mode ON.", false);
        return;
      }
      // the threshold is reached or was reached before
      if (!isOn() ||                          // safe mode is off
          extension <= 0 || threshold <= 0) { // don't need to wait
        this.leave(); // leave safe mode
        return;
      }
      if (reached > 0) { // threshold has already been reached before
        reportStatus("STATE* Safe mode ON.", false);
        return;
      }
      // start monitor
      reached = monotonicNow();
      reachedTimestamp = now();
      if (smmthread == null) {
        smmthread = new Daemon(new SafeModeMonitor());
        smmthread.start();
        reportStatus("STATE* Safe mode extension entered.", true);
      }

      // check if we are ready to initialize replication queues
      if (canInitializeReplQueues() && !isPopulatingReplQueues()
          && !haEnabled) {
        initializeReplQueues();
      }
    }

    /**
     * Set total number of blocks.
     */
    private synchronized void setBlockTotal(int total) {
      this.blockTotal = total;
      this.blockThreshold = (int) (blockTotal * threshold);
      this.blockReplQueueThreshold =
        (int) (blockTotal * replQueueThreshold);
      if (haEnabled) {
        // After we initialize the block count, any further namespace
        // modifications done while in safe mode need to keep track
        // of the number of total blocks in the system.
        this.shouldIncrementallyTrackBlocks = true;
      }
      // blockSafe is -1 in manual/low-resource safe mode; reset before
      // re-evaluating thresholds.
      if(blockSafe < 0)
        this.blockSafe = 0;
      checkMode();
    }

    /**
     * Increment number of safe blocks if current block has
     * reached minimal replication.
     * @param replication current replication
     */
    private synchronized void incrementSafeBlockCount(short replication) {
      if (replication == safeReplication) {
        this.blockSafe++;

        // Report startup progress only if we haven't completed startup yet.
        StartupProgress prog = NameNode.getStartupProgress();
        if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
          if (this.awaitingReportedBlocksCounter == null) {
            this.awaitingReportedBlocksCounter = prog.getCounter(Phase.SAFEMODE,
                STEP_AWAITING_REPORTED_BLOCKS);
          }
          this.awaitingReportedBlocksCounter.increment();
        }

        checkMode();
      }
    }

    /**
     * Decrement number of safe blocks if current block has
     * fallen below minimal replication.
     * @param replication current replication
     */
    private synchronized void decrementSafeBlockCount(short replication) {
      // Only transition matters: the block just dropped below safeReplication.
      if (replication == safeReplication-1) {
        this.blockSafe--;
        //blockSafe is set to -1 in manual / low resources safemode
        assert blockSafe >= 0 || isManual() || areResourcesLow();
        checkMode();
      }
    }

    /**
     * Check if safe mode was entered manually
     */
    private boolean isManual() {
      // Manual mode is encoded as an "infinite" extension; see setManual().
      return extension == Integer.MAX_VALUE;
    }

    /**
     * Set manual safe mode.
     */
    private synchronized void setManual() {
      extension = Integer.MAX_VALUE;
    }

    /**
     * Check if safe mode was entered due to resources being low.
     */
    private boolean areResourcesLow() {
      return resourcesLow;
    }

    /**
     * Set that resources are low for this instance of safe mode.
     */
    private void setResourcesLow() {
      resourcesLow = true;
    }

    /**
     * A tip on how safe mode is to be turned off: manually or automatically.
     */
    String getTurnOffTip() {
      if(!isOn()) {
        return "Safe mode is OFF.";
      }

      //Manual OR low-resource safemode. (Admin intervention required)
      String adminMsg = "It was turned on manually. ";
      if (areResourcesLow()) {
        adminMsg = "Resources are low on NN. Please add or free up more "
          + "resources then turn off safe mode manually. NOTE: If you turn off"
          + " safe mode before adding resources, "
          + "the NN will immediately return to safe mode. ";
      }
      if (isManual() || areResourcesLow()) {
        return adminMsg
          + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
      }

      // Automatic safemode: describe which thresholds are still unmet.
      boolean thresholdsMet = true;
      int numLive = getNumLiveDataNodes();
      String msg = "";
      if (blockSafe < blockThreshold) {
        msg += String.format(
          "The reported blocks %d needs additional %d"
          + " blocks to reach the threshold %.4f of total blocks %d.%n",
          blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
        thresholdsMet = false;
      } else {
        msg += String.format("The reported blocks %d has reached the threshold"
            + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);
      }
      if (numLive < datanodeThreshold) {
        msg += String.format(
          "The number of live datanodes %d needs an additional %d live "
          + "datanodes to reach the minimum number %d.%n",
          numLive, (datanodeThreshold - numLive), datanodeThreshold);
        thresholdsMet = false;
      } else {
        msg += String.format("The number of live datanodes %d has reached "
            + "the minimum number %d. ",
            numLive, datanodeThreshold);
      }
      msg += (reached > 0) ? "In safe mode extension. " : "";
      msg += "Safe mode will be turned off automatically ";

      if (!thresholdsMet) {
        msg += "once the thresholds have been reached.";
      } else if (reached + extension - monotonicNow() > 0) {
        msg += ("in " + (reached + extension - monotonicNow()) / 1000
            + " seconds.");
      } else {
        msg += "soon.";
      }

      return msg;
    }

    /**
     * Print status every 20 seconds.
     */
    private void reportStatus(String msg, boolean rightNow) {
      long curTime = now();
      // Throttle to at most one report per 20s unless rightNow is set.
      if(!rightNow && (curTime - lastStatusReport < 20 * 1000))
        return;
      NameNode.stateChangeLog.info(msg + " \n" + getTurnOffTip());
      lastStatusReport = curTime;
    }

    @Override
    public String toString() {
      String resText = "Current safe blocks = "
        + blockSafe
        + ". Target blocks = " + blockThreshold + " for threshold = %"
        + threshold
        + ". Minimal replication = " + safeReplication + ".";
      if (reached > 0)
        resText += " Threshold was reached " + new Date(reachedTimestamp) + ".";
      return resText;
    }

    /**
     * Checks consistency of the class state.
     * This is costly so only runs if asserts are enabled.
     */
    private void doConsistencyCheck() {
      boolean assertsOn = false;
      assert assertsOn = true; // set to true if asserts are on
      if (!assertsOn) return;

      if (blockTotal == -1 && blockSafe == -1) {
        return; // manual safe mode
      }
      int activeBlocks = blockManager.getActiveBlockCount();
      if ((blockTotal != activeBlocks) &&
          !(blockSafe >= 0 && blockSafe <= blockTotal)) {
        throw new AssertionError(
            " SafeMode: Inconsistent filesystem state: "
            + "SafeMode data: blockTotal=" + blockTotal
            + " blockSafe=" + blockSafe + "; "
            + "BlockManager data: active=" + activeBlocks);
      }
    }

    /**
     * Adjust blockSafe and blockTotal while in safe mode; only effective
     * once incremental tracking has been enabled (HA standby case).
     */
    private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
      if (!shouldIncrementallyTrackBlocks) {
        return;
      }
      assert haEnabled;

      if (LOG.isDebugEnabled()) {
        LOG.debug("Adjusting block totals from " +
            blockSafe + "/" + blockTotal + " to " +
            (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
      }
      assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
        blockSafe + " by " + deltaSafe + ": would be negative";
      assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
        blockTotal + " by " + deltaTotal + ": would be negative";

      blockSafe += deltaSafe;
      setBlockTotal(blockTotal + deltaTotal);
    }
  }

  /**
   * Periodically check whether it is time to leave safe mode.
   * This thread starts when the threshold level is reached.
   *
   */
  class SafeModeMonitor implements Runnable {
    /** interval in msec for checking safe mode: {@value} */
    private static final long recheckInterval = 1000;

    /**
     * Poll until safe mode can be left, then exit; also exits when safe mode
     * is already off or the namesystem is shutting down.
     */
    @Override
    public void run() {
      while (fsRunning) {
        writeLock();
        try {
          if (safeMode == null) { // Not in safe mode.
            break;
          }
          if (safeMode.canLeave()) {
            // Leave safe mode.
            safeMode.leave();
            smmthread = null;
            break;
          }
        } finally {
          writeUnlock();
        }

        try {
          Thread.sleep(recheckInterval);
        } catch (InterruptedException ie) {
          // Ignored — loop rechecks fsRunning/safeMode on the next pass.
          // NOTE(review): interrupt status is not restored here; confirm
          // callers never rely on this thread observing interruption.
        }
      }
      if (!fsRunning) {
        LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
      }
    }
  }

  /**
   * Apply a dfsadmin safemode action (enter/leave/get); requires superuser
   * for anything other than GET.
   * @return whether the NN is in safe mode after the action
   */
  boolean setSafeMode(SafeModeAction action) throws IOException {
    if (action != SafeModeAction.SAFEMODE_GET) {
      checkSuperuserPrivilege();
      switch(action) {
      case SAFEMODE_LEAVE: // leave safe mode
        leaveSafeMode();
        break;
      case SAFEMODE_ENTER: // enter safe mode
        enterSafeMode(false);
        break;
      default:
        LOG.error("Unexpected safe mode action");
      }
    }
    return isInSafeMode();
  }

  @Override
  public void checkSafeMode() {
    // safeMode is volatile, and may be set to null at any time
    SafeModeInfo safeMode = this.safeMode;
    if (safeMode != null) {
      safeMode.checkMode();
    }
  }

  @Override
  public boolean isInSafeMode() {
    // safeMode is volatile, and may be set to null at any time
    SafeModeInfo safeMode = this.safeMode;
    if (safeMode == null)
      return false;
    return safeMode.isOn();
  }

  @Override
  public boolean isInStartupSafeMode() {
    // safeMode is volatile, and may be set to null at any time
    SafeModeInfo safeMode = this.safeMode;
    if (safeMode == null)
      return false;
    // If the NN is in safemode, and not due to manual / low resources, we
    // assume it must be because of startup. If the NN had low resources during
    // startup, we assume it came out of startup safemode and it is now in low
    // resources safemode
    return !safeMode.isManual() && !safeMode.areResourcesLow()
      && safeMode.isOn();
  }

  /**
   * Check if replication queues are to be populated
   * @return true when node is HAState.Active and not in the very first safemode
   */
  @Override
  public boolean isPopulatingReplQueues() {
    if (!shouldPopulateReplQueues()) {
      return false;
    }
    return initializedReplQueues;
  }

  /** Whether the current HA state allows populating replication queues. */
  private boolean shouldPopulateReplQueues() {
    if(haContext == null || haContext.getState() == null)
      return false;
    return haContext.getState().shouldPopulateReplQueues();
  }

  @Override
  public void incrementSafeBlockCount(int replication) {
    // safeMode is volatile, and may be set to null at any time
    SafeModeInfo safeMode = this.safeMode;
    if (safeMode == null)
      return;
    safeMode.incrementSafeBlockCount((short)replication);
  }

  @Override
  public void decrementSafeBlockCount(Block b) {
    // safeMode is volatile, and may be set to null at any time
    SafeModeInfo safeMode = this.safeMode;
    if (safeMode == null) // mostly true
      return;
    // Only COMPLETE blocks are counted towards the safe-block total.
    BlockInfoContiguous storedBlock = getStoredBlock(b);
    if (storedBlock.isComplete()) {
      safeMode.decrementSafeBlockCount(
          (short)blockManager.countNodes(b).liveReplicas());
    }
  }

  /**
   * Adjust the total number of blocks safe and expected during safe mode.
5700 * If safe mode is not currently on, this is a no-op. 5701 * @param deltaSafe the change in number of safe blocks 5702 * @param deltaTotal the change i nnumber of total blocks expected 5703 */ 5704 @Override 5705 public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) { 5706 // safeMode is volatile, and may be set to null at any time 5707 SafeModeInfo safeMode = this.safeMode; 5708 if (safeMode == null) 5709 return; 5710 safeMode.adjustBlockTotals(deltaSafe, deltaTotal); 5711 } 5712 5713 /** 5714 * Set the total number of blocks in the system. 5715 */ 5716 public void setBlockTotal() { 5717 // safeMode is volatile, and may be set to null at any time 5718 SafeModeInfo safeMode = this.safeMode; 5719 if (safeMode == null) 5720 return; 5721 safeMode.setBlockTotal((int)getCompleteBlocksTotal()); 5722 } 5723 5724 /** 5725 * Get the total number of blocks in the system. 5726 */ 5727 @Override // FSNamesystemMBean 5728 @Metric 5729 public long getBlocksTotal() { 5730 return blockManager.getTotalBlocks(); 5731 } 5732 5733 /** 5734 * Get the total number of COMPLETE blocks in the system. 5735 * For safe mode only complete blocks are counted. 5736 */ 5737 private long getCompleteBlocksTotal() { 5738 // Calculate number of blocks under construction 5739 long numUCBlocks = 0; 5740 readLock(); 5741 numUCBlocks = leaseManager.getNumUnderConstructionBlocks(); 5742 try { 5743 return getBlocksTotal() - numUCBlocks; 5744 } finally { 5745 readUnlock(); 5746 } 5747 } 5748 5749 /** 5750 * Enter safe mode. If resourcesLow is false, then we assume it is manual 5751 * @throws IOException 5752 */ 5753 void enterSafeMode(boolean resourcesLow) throws IOException { 5754 writeLock(); 5755 try { 5756 // Stop the secret manager, since rolling the master key would 5757 // try to write to the edit log 5758 stopSecretManager(); 5759 5760 // Ensure that any concurrent operations have been fully synced 5761 // before entering safe mode. 
This ensures that the FSImage 5762 // is entirely stable on disk as soon as we're in safe mode. 5763 boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite(); 5764 // Before Editlog is in OpenForWrite mode, editLogStream will be null. So, 5765 // logSyncAll call can be called only when Edlitlog is in OpenForWrite mode 5766 if (isEditlogOpenForWrite) { 5767 getEditLog().logSyncAll(); 5768 } 5769 if (!isInSafeMode()) { 5770 safeMode = new SafeModeInfo(resourcesLow); 5771 return; 5772 } 5773 if (resourcesLow) { 5774 safeMode.setResourcesLow(); 5775 } else { 5776 safeMode.setManual(); 5777 } 5778 if (isEditlogOpenForWrite) { 5779 getEditLog().logSyncAll(); 5780 } 5781 NameNode.stateChangeLog.info("STATE* Safe mode is ON" 5782 + safeMode.getTurnOffTip()); 5783 } finally { 5784 writeUnlock(); 5785 } 5786 } 5787 5788 /** 5789 * Leave safe mode. 5790 */ 5791 void leaveSafeMode() { 5792 writeLock(); 5793 try { 5794 if (!isInSafeMode()) { 5795 NameNode.stateChangeLog.info("STATE* Safe mode is already OFF"); 5796 return; 5797 } 5798 safeMode.leave(); 5799 } finally { 5800 writeUnlock(); 5801 } 5802 } 5803 5804 String getSafeModeTip() { 5805 // There is no need to take readLock. 5806 // Don't use isInSafeMode as this.safeMode might be set to null. 5807 // after isInSafeMode returns. 
    boolean inSafeMode;
    SafeModeInfo safeMode = this.safeMode;
    if (safeMode == null) {
      inSafeMode = false;
    } else {
      inSafeMode = safeMode.isOn();
    }

    if (!inSafeMode) {
      return "";
    } else {
      return safeMode.getTurnOffTip();
    }
  }

  /**
   * Roll the edit log; requires superuser and rejects the request while the
   * NN is in safe mode.
   * @return the signature of the new checkpoint era
   */
  CheckpointSignature rollEditLog() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.JOURNAL);
    writeLock();
    try {
      checkOperation(OperationCategory.JOURNAL);
      checkNameNodeSafeMode("Log not rolled");
      if (Server.isRpcInvocation()) {
        LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
      }
      return getFSImage().rollEditLog();
    } finally {
      writeUnlock();
    }
  }

  /**
   * Start a checkpoint on behalf of the given backup node; rejected while in
   * safe mode.
   */
  NamenodeCommand startCheckpoint(NamenodeRegistration backupNode,
      NamenodeRegistration activeNamenode) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    writeLock();
    try {
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not started");

      LOG.info("Start checkpoint for " + backupNode.getAddress());
      NamenodeCommand cmd = getFSImage().startCheckpoint(backupNode,
          activeNamenode);
      getEditLog().logSync();
      return cmd;
    } finally {
      writeUnlock();
    }
  }

  /** Forward an incremental block report to the block manager. */
  public void processIncrementalBlockReport(final DatanodeID nodeID,
      final StorageReceivedDeletedBlocks srdb)
      throws IOException {
    writeLock();
    try {
      blockManager.processIncrementalBlockReport(nodeID, srdb);
    } finally {
      writeUnlock();
    }
  }

  /** Finish the checkpoint started by the given node; rejected in safe mode. */
  void endCheckpoint(NamenodeRegistration registration,
      CheckpointSignature sig) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    readLock();
    try {
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not ended");
      LOG.info("End checkpoint for " + registration.getAddress());
      getFSImage().endCheckpoint(sig);
    } finally {
      readUnlock();
    }
  }

  /** Build a PermissionStatus owned by the fs owner and supergroup. */
  PermissionStatus createFsOwnerPermissions(FsPermission permission) {
    return new PermissionStatus(fsOwner.getShortUserName(), supergroup,
        permission);
  }

  /**
   * Deny superuser access to an inode carrying the
   * "unreadable by superuser" xattr.
   */
  private void checkUnreadableBySuperuser(FSPermissionChecker pc,
      INode inode, int snapshotId)
      throws IOException {
    if (pc.isSuperUser()) {
      for (XAttr xattr : FSDirXAttrOp.getXAttrs(dir, inode, snapshotId)) {
        if (XAttrHelper.getPrefixName(xattr).
            equals(SECURITY_XATTR_UNREADABLE_BY_SUPERUSER)) {
          throw new AccessControlException("Access is denied for " +
              pc.getUser() + " since the superuser is not allowed to " +
              "perform this operation.");
        }
      }
    }
  }

  @Override
  public void checkSuperuserPrivilege()
      throws AccessControlException {
    // No-op when permissions are disabled.
    if (isPermissionEnabled) {
      FSPermissionChecker pc = getPermissionChecker();
      pc.checkSuperuserPrivilege();
    }
  }

  /**
   * Check to see if we have exceeded the limit on the number
   * of inodes.
   */
  void checkFsObjectLimit() throws IOException {
    // maxFsObjects == 0 means "no limit".
    if (maxFsObjects != 0 &&
        maxFsObjects <= dir.totalInodes() + getBlocksTotal()) {
      throw new IOException("Exceeded the configured number of objects " +
          maxFsObjects + " in the filesystem.");
    }
  }

  /**
   * Get the total number of objects in the system.
   */
  @Override // FSNamesystemMBean
  public long getMaxObjects() {
    return maxFsObjects;
  }

  @Override // FSNamesystemMBean
  @Metric
  public long getFilesTotal() {
    // There is no need to take fSNamesystem's lock as
    // FSDirectory has its own lock.
    return this.dir.totalInodes();
  }

  @Override // FSNamesystemMBean
  @Metric
  public long getPendingReplicationBlocks() {
    return blockManager.getPendingReplicationBlocksCount();
  }

  @Override // FSNamesystemMBean
  @Metric
  public long getUnderReplicatedBlocks() {
    return blockManager.getUnderReplicatedBlocksCount();
  }

  /** Returns number of blocks with corrupt replicas */
  @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"})
  public long getCorruptReplicaBlocks() {
    return blockManager.getCorruptReplicaBlocksCount();
  }

  @Override // FSNamesystemMBean
  @Metric
  public long getScheduledReplicationBlocks() {
    return blockManager.getScheduledReplicationBlocksCount();
  }

  @Override
  @Metric
  public long getPendingDeletionBlocks() {
    return blockManager.getPendingDeletionBlocksCount();
  }

  @Override
  public long getBlockDeletionStartTime() {
    return startTime + blockManager.getStartupDelayBlockDeletionInMs();
  }

  @Metric
  public long getExcessBlocks() {
    return blockManager.getExcessBlocksCount();
  }

  // HA-only metric
  @Metric
  public long getPostponedMisreplicatedBlocks() {
    return blockManager.getPostponedMisreplicatedBlocksCount();
  }

  // HA-only metric
  @Metric
  public int getPendingDataNodeMessageCount() {
    return blockManager.getPendingDataNodeMessageCount();
  }

  // HA-only metric
  @Metric
  public String getHAState() {
    return haContext.getState().toString();
  }

  // HA-only metric
  @Metric
  public long getMillisSinceLastLoadedEdits() {
    if (isInStandbyState() && editLogTailer != null) {
      return monotonicNow() - editLogTailer.getLastLoadTimeMs();
    } else {
      return 0;
    }
  }

  @Metric
  public int getBlockCapacity() {
    return blockManager.getCapacity();
  }

  @Override // FSNamesystemMBean
  public String getFSState() {
    return isInSafeMode() ? "safeMode" : "Operational";
  }

  private ObjectName mbeanName;
  private ObjectName mxbeanName;

  /**
   * Register the FSNamesystem MBean using the name
   * "hadoop:service=NameNode,name=FSNamesystemState"
   */
  private void registerMBean() {
    // We can only implement one MXBean interface, so we keep the old one.
    try {
      StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class);
      mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean);
    } catch (NotCompliantMBeanException e) {
      throw new RuntimeException("Bad MBean setup", e);
    }

    LOG.info("Registered FSNamesystemState MBean");
  }

  /**
   * shutdown FSNamesystem: stops managers, unregisters MBeans, and shuts
   * down the directory and block manager if present.
   */
  void shutdown() {
    if (snapshotManager != null) {
      snapshotManager.shutdown();
    }
    if (mbeanName != null) {
      MBeans.unregister(mbeanName);
      mbeanName = null;
    }
    if (mxbeanName != null) {
      MBeans.unregister(mxbeanName);
      mxbeanName = null;
    }
    if (dir != null) {
      dir.shutdown();
    }
    if (blockManager != null) {
      blockManager.shutdown();
    }
  }

  @Override // FSNamesystemMBean
  public int getNumLiveDataNodes() {
    return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
  }

  @Override // FSNamesystemMBean
  public int getNumDeadDataNodes() {
    return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
  }

  @Override // FSNamesystemMBean
  public int getNumDecomLiveDataNodes() {
    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
    int liveDecommissioned = 0;
    for (DatanodeDescriptor node : live) {
      liveDecommissioned += node.isDecommissioned() ? 1 : 0;
    }
    return liveDecommissioned;
  }

  @Override // FSNamesystemMBean
  public int getNumDecomDeadDataNodes() {
    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
    getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, true);
    int deadDecommissioned = 0;
    for (DatanodeDescriptor node : dead) {
      deadDecommissioned += node.isDecommissioned() ? 1 : 0;
    }
    return deadDecommissioned;
  }

  @Override // FSNamesystemMBean
  public int getVolumeFailuresTotal() {
    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
    int volumeFailuresTotal = 0;
    for (DatanodeDescriptor node: live) {
      volumeFailuresTotal += node.getVolumeFailures();
    }
    return volumeFailuresTotal;
  }

  @Override // FSNamesystemMBean
  public long getEstimatedCapacityLostTotal() {
    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
    long estimatedCapacityLostTotal = 0;
    for (DatanodeDescriptor node: live) {
      VolumeFailureSummary volumeFailureSummary =
          node.getVolumeFailureSummary();
      if (volumeFailureSummary != null) {
        estimatedCapacityLostTotal +=
            volumeFailureSummary.getEstimatedCapacityLostTotal();
      }
    }
    return estimatedCapacityLostTotal;
  }

  @Override // FSNamesystemMBean
  public int getNumDecommissioningDataNodes() {
    return getBlockManager().getDatanodeManager().getDecommissioningNodes()
        .size();
  }

  @Override // FSNamesystemMBean
  @Metric({"StaleDataNodes",
    "Number of datanodes marked stale due to delayed heartbeat"})
  public int getNumStaleDataNodes() {
    return getBlockManager().getDatanodeManager().getNumStaleNodes();
  }

  /**
   * Storages are marked as "content stale" after NN restart or fails over and
   * before NN receives the first Heartbeat followed by the first Blockreport.
   */
  @Override // FSNamesystemMBean
  public int getNumStaleStorages() {
    return getBlockManager().getDatanodeManager().getNumStaleStorages();
  }

  /**
   * Serialize the current top-user operation windows to JSON; returns null
   * when the top metrics feature is disabled or serialization fails.
   */
  @Override // FSNamesystemMBean
  public String getTopUserOpCounts() {
    if (!topConf.isEnabled) {
      return null;
    }

    Date now = new Date();
    final List<RollingWindowManager.TopWindow> topWindows =
        topMetrics.getTopWindows();
    Map<String, Object> topMap = new TreeMap<String, Object>();
    topMap.put("windows", topWindows);
    topMap.put("timestamp", DFSUtil.dateToIso8601String(now));
    ObjectMapper mapper = new ObjectMapper();
    try {
      return mapper.writeValueAsString(topMap);
    } catch (IOException e) {
      LOG.warn("Failed to fetch TopUser metrics", e);
    }
    return null;
  }

  /**
   * Increments, logs and then returns the stamp
   */
  long nextGenerationStamp(boolean legacyBlock)
      throws IOException, SafeModeException {
    assert hasWriteLock();
    checkNameNodeSafeMode("Cannot get next generation stamp");

    long gs = blockIdManager.nextGenerationStamp(legacyBlock);
    if (legacyBlock) {
      getEditLog().logGenerationStampV1(gs);
    } else {
      getEditLog().logGenerationStampV2(gs);
    }

    // NB: callers sync the log
    return gs;
  }

  /**
   * Increments, logs and then returns the block ID
   */
  private long nextBlockId() throws IOException {
    assert hasWriteLock();
    checkNameNodeSafeMode("Cannot get next block ID");
    final long blockId = blockIdManager.nextBlockId();
    getEditLog().logAllocateBlockId(blockId);
    // NB: callers sync the log
    return blockId;
  }

  /**
   * Whether the file has been deleted: either absent from the inode map,
   * detached from the directory tree, or marked deleted in its snapshot
   * feature.
   */
  private boolean isFileDeleted(INodeFile file) {
    // Not in the inodeMap or in the snapshot but marked deleted.
    if (dir.getInode(file.getId()) == null) {
      return true;
    }

    // look at the path hierarchy to see if one parent is deleted by recursive
    // deletion
    INode tmpChild = file;
    INodeDirectory tmpParent = file.getParent();
    while (true) {
      if (tmpParent == null) {
        return true;
      }

      INode childINode = tmpParent.getChild(tmpChild.getLocalNameBytes(),
          Snapshot.CURRENT_STATE_ID);
      if (childINode == null || !childINode.equals(tmpChild)) {
        // a newly created INode with the same name as an already deleted one
        // would be a different INode than the deleted one
        return true;
      }

      if (tmpParent.isRoot()) {
        break;
      }

      tmpChild = tmpParent;
      tmpParent = tmpParent.getParent();
    }

    if (file.isWithSnapshot() &&
        file.getFileWithSnapshotFeature().isCurrentFileDeleted()) {
      return true;
    }
    return false;
  }

  /**
   * Validate that the block is under construction, belongs to a live
   * under-construction file, and is leased by the given client.
   * @return the under-construction file the block belongs to
   * @throws IOException if any check fails
   */
  private INodeFile checkUCBlock(ExtendedBlock block,
      String clientName) throws IOException {
    assert hasWriteLock();
    checkNameNodeSafeMode("Cannot get a new generation stamp and an "
        + "access token for block " + block);

    // check stored block state
    BlockInfoContiguous storedBlock =
        getStoredBlock(ExtendedBlock.getLocalBlock(block));
    if (storedBlock == null ||
        storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) {
      throw new IOException(block +
          " does not exist or is not under Construction" + storedBlock);
    }

    // check file inode
    final INodeFile file = ((INode)storedBlock.getBlockCollection()).asFile();
    if (file == null || !file.isUnderConstruction() || isFileDeleted(file)) {
      throw new IOException("The file " + storedBlock +
          " belonged to does not exist or it is not under construction.");
    }

    // check lease
    if (clientName == null
        || !clientName.equals(file.getFileUnderConstructionFeature()
            .getClientName())) {
      throw new LeaseExpiredException("Lease mismatch: " + block +
          " is accessed by a non lease holder " + clientName);
    }

    return file;
  }

  /**
   * Client is reporting some bad block locations.
   */
  void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
    checkOperation(OperationCategory.WRITE);
    NameNode.stateChangeLog.info("*DIR* reportBadBlocks");
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      // Mark every reported replica location corrupt.
      for (int i = 0; i < blocks.length; i++) {
        ExtendedBlock blk = blocks[i].getBlock();
        DatanodeInfo[] nodes = blocks[i].getLocations();
        String[] storageIDs = blocks[i].getStorageIDs();
        for (int j = 0; j < nodes.length; j++) {
          blockManager.findAndMarkBlockAsCorrupt(blk, nodes[j],
              storageIDs == null ? null: storageIDs[j],
              "client machine reported it");
        }
      }
    } finally {
      writeUnlock();
    }
  }

  /**
   * Get a new generation stamp together with an access token for
   * a block under construction
   *
   * This method is called for recovering a failed pipeline or setting up
   * a pipeline to append to a block.
   *
   * @param block a block
   * @param clientName the name of a client
   * @return a located block with a new generation stamp and an access token
   * @throws IOException if any error occurs
   */
  LocatedBlock updateBlockForPipeline(ExtendedBlock block,
      String clientName) throws IOException {
    LocatedBlock locatedBlock;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);

      // check vadility of parameters
      checkUCBlock(block, clientName);

      // get a new generation stamp and an access token
      block.setGenerationStamp(nextGenerationStamp(blockIdManager.isLegacyBlock(block.getLocalBlock())));
      locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]);
      blockManager.setBlockToken(locatedBlock, AccessMode.WRITE);
    } finally {
      writeUnlock();
    }
    // Ensure we record the new generation stamp
    getEditLog().logSync();
    return locatedBlock;
  }

  /**
   * Update a pipeline for a block under construction
   *
   * @param clientName the name of the client
   * @param oldBlock and old block
   * @param newBlock a new block with a new generation stamp and length
   * @param newNodes datanodes in the pipeline
   * @throws IOException if any error occurs
   */
  void updatePipeline(
      String clientName, ExtendedBlock oldBlock, ExtendedBlock newBlock,
      DatanodeID[] newNodes, String[] newStorageIDs, boolean logRetryCache)
      throws IOException {
    checkOperation(OperationCategory.WRITE);

    LOG.info("updatePipeline(" + oldBlock.getLocalBlock()
             + ", newGS=" + newBlock.getGenerationStamp()
             + ", newLength=" + newBlock.getNumBytes()
             + ", newNodes=" + Arrays.asList(newNodes)
             + ", client=" + clientName
             + ")");
    waitForLoadingFSImage();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Pipeline not updated");
      assert newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and "
        + oldBlock + " has different block identifier";
      updatePipelineInternal(clientName, oldBlock, newBlock, newNodes,
          newStorageIDs, logRetryCache);
    } finally {
      writeUnlock();
    }
    // sync the edit log outside the FSN write lock
    getEditLog().logSync();
    LOG.info("updatePipeline(" + oldBlock.getLocalBlock() + " => "
        + newBlock.getLocalBlock() + ") success");
  }

  private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock,
      ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs,
      boolean logRetryCache)
      throws IOException {
    assert hasWriteLock();
    // check the vadility of the block and lease holder name
    final INodeFile pendingFile = checkUCBlock(oldBlock, clientName);
    final BlockInfoContiguousUnderConstruction blockinfo
        = (BlockInfoContiguousUnderConstruction)pendingFile.getLastBlock();

    // check new GS & length: this is not expected
    if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() ||
        newBlock.getNumBytes() < blockinfo.getNumBytes()) {
      String msg = "Update " + oldBlock + " (len = " +
        blockinfo.getNumBytes() + ") to an older state: " + newBlock +
        " (len = " + newBlock.getNumBytes() +")";
      LOG.warn(msg);
      throw new IOException(msg);
    }

    // Update old block with the new generation stamp and new length
    blockinfo.setNumBytes(newBlock.getNumBytes());
    blockinfo.setGenerationStampAndVerifyReplicas(newBlock.getGenerationStamp());

    // find the DatanodeDescriptor objects
    final DatanodeStorageInfo[] storages = blockManager.getDatanodeManager()
        .getDatanodeStorageInfos(newNodes, newStorageIDs);
    blockinfo.setExpectedLocations(storages);

    String src = pendingFile.getFullPathName();
    persistBlocks(src, pendingFile, logRetryCache);
  }

  // rename was successful.
  // If any part of the renamed subtree had
  // files that were being written to, update with new filename.
  void unprotectedChangeLease(String src, String dst) {
    assert hasWriteLock();
    leaseManager.changeLease(src, dst);
  }

  /**
   * Serializes leases.
   */
  void saveFilesUnderConstruction(DataOutputStream out,
      Map<Long, INodeFile> snapshotUCMap) throws IOException {
    // This is run by an inferior thread of saveNamespace, which holds a read
    // lock on our behalf. If we took the read lock here, we could block
    // for fairness if a writer is waiting on the lock.
    synchronized (leaseManager) {
      Map<String, INodeFile> nodes = leaseManager.getINodesUnderConstruction();
      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
        // TODO: for HDFS-5428, because of rename operations, some
        // under-construction files that are
        // in the current fs directory can also be captured in the
        // snapshotUCMap. We should remove them from the snapshotUCMap.
        snapshotUCMap.remove(entry.getValue().getId());
      }

      out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size
      for (Map.Entry<String, INodeFile> entry : nodes.entrySet()) {
        FSImageSerialization.writeINodeUnderConstruction(
            out, entry.getValue(), entry.getKey());
      }
      for (Map.Entry<Long, INodeFile> entry : snapshotUCMap.entrySet()) {
        // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
        // as their paths
        StringBuilder b = new StringBuilder();
        b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
            .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
            .append(Path.SEPARATOR).append(entry.getValue().getId());
        FSImageSerialization.writeINodeUnderConstruction(
            out, entry.getValue(), b.toString());
      }
    }
  }

  /**
   * @return all the under-construction files in the lease map
   */
  Map<String, INodeFile> getFilesUnderConstruction() {
    synchronized (leaseManager) {
      return leaseManager.getINodesUnderConstruction();
    }
  }

  /**
   * Register a Backup name-node, verifying that it belongs
   * to the correct namespace, and adding it to the set of
   * active journals if necessary.
   *
   * @param bnReg registration of the new BackupNode
   * @param nnReg registration of this NameNode
   * @throws IOException if the namespace IDs do not match
   */
  void registerBackupNode(NamenodeRegistration bnReg,
      NamenodeRegistration nnReg) throws IOException {
    writeLock();
    try {
      if(getFSImage().getStorage().getNamespaceID()
         != bnReg.getNamespaceID())
        throw new IOException("Incompatible namespaceIDs: "
            + " Namenode namespaceID = "
            + getFSImage().getStorage().getNamespaceID() + "; "
            + bnReg.getRole() +
            " node namespaceID = " + bnReg.getNamespaceID());
      if (bnReg.getRole() == NamenodeRole.BACKUP) {
        getFSImage().getEditLog().registerBackupNode(
            bnReg, nnReg);
      }
    } finally {
      writeUnlock();
    }
  }

  /**
   * Release (unregister) backup node.
   * <p>
   * Find and remove the backup stream corresponding to the node.
   * @throws IOException
   */
  void releaseBackupNode(NamenodeRegistration registration)
    throws IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if(getFSImage().getStorage().getNamespaceID()
         != registration.getNamespaceID())
        throw new IOException("Incompatible namespaceIDs: "
            + " Namenode namespaceID = "
            + getFSImage().getStorage().getNamespaceID() + "; "
            + registration.getRole() +
            " node namespaceID = " + registration.getNamespaceID());
      getEditLog().releaseBackupStream(registration);
    } finally {
      writeUnlock();
    }
  }

  // Simple value holder pairing a corrupt block with the path of the file it
  // belongs to; rendered as "blockName<TAB>path".
  static class CorruptFileBlockInfo {
    final String path;
    final Block block;

    public CorruptFileBlockInfo(String p, Block b) {
      path = p;
      block = b;
    }

    @Override
    public String toString() {
      return block.getBlockName() + "\t" + path;
    }
  }
  /**
   * @param path Restrict corrupt files to this portion of namespace.
   * @param cookieTab Support for continuation; cookieTab tells where
   *                  to start from
   * @return a list in which each entry describes a corrupt file/block
   * @throws IOException
   */
  Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
      String[] cookieTab) throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.READ);

    int count = 0;
    ArrayList<CorruptFileBlockInfo> corruptFiles =
        new ArrayList<CorruptFileBlockInfo>();
    if (cookieTab == null) {
      cookieTab = new String[] { null };
    }

    // Do a quick check if there are any corrupt files without taking the lock
    if (blockManager.getMissingBlocksCount() == 0) {
      if (cookieTab[0] == null) {
        cookieTab[0] = String.valueOf(getIntCookie(cookieTab[0]));
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("there are no corrupt file blocks.");
      }
      return corruptFiles;
    }

    readLock();
    try {
      checkOperation(OperationCategory.READ);
      if (!isPopulatingReplQueues()) {
        throw new IOException("Cannot run listCorruptFileBlocks because " +
                              "replication queues have not been initialized.");
      }
      // print a limited # of corrupt files per call

      final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator();

      // skip the blocks already returned by earlier calls (continuation)
      int skip = getIntCookie(cookieTab[0]);
      for (int i = 0; i < skip && blkIterator.hasNext(); i++) {
        blkIterator.next();
      }

      while (blkIterator.hasNext()) {
        Block blk = blkIterator.next();
        final INode inode = (INode)blockManager.getBlockCollection(blk);
        skip++;
        if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) {
          String src = FSDirectory.getFullPathName(inode);
          if (src.startsWith(path)){
            corruptFiles.add(new CorruptFileBlockInfo(src, blk));
            count++;
            if (count >= DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED)
              break;
          }
        }
      }
      cookieTab[0] = String.valueOf(skip);
      if (LOG.isDebugEnabled()) {
        LOG.debug("list corrupt file blocks returned: " + count);
      }
      return corruptFiles;
    } finally {
      readUnlock();
    }
  }

  /**
   * Convert string cookie to integer.
   */
  private static int getIntCookie(String cookie){
    int c;
    if(cookie == null){
      c = 0;
    } else {
      try{
        c = Integer.parseInt(cookie);
      }catch (NumberFormatException e) {
        // malformed cookie: restart the listing from the beginning
        c = 0;
      }
    }
    c = Math.max(0, c);
    return c;
  }

  /**
   * Create delegation token secret manager
   */
  private DelegationTokenSecretManager createDelegationTokenSecretManager(
      Configuration conf) {
    return new DelegationTokenSecretManager(conf.getLong(
        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY,
        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT),
        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY,
            DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT),
        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
            DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT),
        DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL,
        conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
            DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT),
        this);
  }

  /**
   * Returns the DelegationTokenSecretManager instance in the namesystem.
   * @return delegation token secret manager object
   */
  DelegationTokenSecretManager getDelegationTokenSecretManager() {
    return dtSecretManager;
  }

  /**
   * @param renewer Renewer information
   * @return delegation toek
   * @throws IOException on error
   */
  Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
      throws IOException {
    Token<DelegationTokenIdentifier> token;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot issue delegation token");
      if (!isAllowedDelegationTokenOp()) {
        throw new IOException(
          "Delegation Token can be issued only with kerberos or web authentication");
      }
      if (dtSecretManager == null || !dtSecretManager.isRunning()) {
        LOG.warn("trying to get DT with no secret manager running");
        return null;
      }

      UserGroupInformation ugi = getRemoteUser();
      String user = ugi.getUserName();
      Text owner = new Text(user);
      Text realUser = null;
      if (ugi.getRealUser() != null) {
        // proxy-user case: record the real user in the token identifier
        realUser = new Text(ugi.getRealUser().getUserName());
      }
      DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner,
        renewer, realUser);
      token = new Token<DelegationTokenIdentifier>(
        dtId, dtSecretManager);
      long expiryTime = dtSecretManager.getTokenExpiryTime(dtId);
      getEditLog().logGetDelegationToken(dtId, expiryTime);
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    return token;
  }

  /**
   *
   * @param token token to renew
   * @return new expiryTime of the token
   * @throws InvalidToken if {@code token} is invalid
   * @throws IOException on other errors
   */
  long renewDelegationToken(Token<DelegationTokenIdentifier> token)
      throws InvalidToken, IOException {
    long expiryTime;
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);

      checkNameNodeSafeMode("Cannot renew delegation token");
      if (!isAllowedDelegationTokenOp()) {
        throw new IOException(
            "Delegation Token can be renewed only with kerberos or web authentication");
      }
      String renewer = getRemoteUser().getShortUserName();
      expiryTime = dtSecretManager.renewToken(token, renewer);
      // decode the identifier so the renewal can be written to the edit log
      DelegationTokenIdentifier id = new DelegationTokenIdentifier();
      ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier());
      DataInputStream in = new DataInputStream(buf);
      id.readFields(in);
      getEditLog().logRenewDelegationToken(id, expiryTime);
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    return expiryTime;
  }

  /**
   *
   * @param token token to cancel
   * @throws IOException on error
   */
  void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);

      checkNameNodeSafeMode("Cannot cancel delegation token");
      String canceller = getRemoteUser().getUserName();
      DelegationTokenIdentifier id = dtSecretManager
        .cancelToken(token, canceller);
      getEditLog().logCancelDelegationToken(id);
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
  }

  /**
   * @param out save state of the secret manager
   * @param sdPath String storage directory path
   */
  void saveSecretManagerStateCompat(DataOutputStream out, String sdPath)
      throws IOException {
    dtSecretManager.saveSecretManagerStateCompat(out, sdPath);
  }

  SecretManagerState saveSecretManagerState() {
    return dtSecretManager.saveSecretManagerState();
  }

  /**
   * @param in load the state of secret manager from input stream
   */
  void loadSecretManagerStateCompat(DataInput in) throws IOException {
    dtSecretManager.loadSecretManagerStateCompat(in);
  }

  void loadSecretManagerState(SecretManagerSection s,
      List<SecretManagerSection.DelegationKey> keys,
      List<SecretManagerSection.PersistToken> tokens) throws IOException {
    dtSecretManager.loadSecretManagerState(new SecretManagerState(s, keys, tokens));
  }

  /**
   * Log the updateMasterKey operation to edit logs
   *
   * @param key new delegation key.
   */
  public void logUpdateMasterKey(DelegationKey key) {

    assert !isInSafeMode() :
      "this should never be called while in safemode, since we stop " +
      "the DT manager before entering safemode!";
    // No need to hold FSN lock since we don't access any internal
    // structures, and this is stopped before the FSN shuts itself
    // down, etc.
    getEditLog().logUpdateMasterKey(key);
    getEditLog().logSync();
  }

  /**
   * Log the cancellation of expired tokens to edit logs
   *
   * @param id token identifier to cancel
   */
  public void logExpireDelegationToken(DelegationTokenIdentifier id) {
    assert !isInSafeMode() :
      "this should never be called while in safemode, since we stop " +
      "the DT manager before entering safemode!";
    // No need to hold FSN lock since we don't access any internal
    // structures, and this is stopped before the FSN shuts itself
    // down, etc.
    getEditLog().logCancelDelegationToken(id);
  }

  private void logReassignLease(String leaseHolder, String src,
      String newHolder) {
    assert hasWriteLock();
    getEditLog().logReassignLease(leaseHolder, src, newHolder);
  }

  /**
   *
   * @return true if delegation token operation is allowed
   */
  private boolean isAllowedDelegationTokenOp() throws IOException {
    AuthenticationMethod authMethod = getConnectionAuthenticationMethod();
    if (UserGroupInformation.isSecurityEnabled()
        && (authMethod != AuthenticationMethod.KERBEROS)
        && (authMethod != AuthenticationMethod.KERBEROS_SSL)
        && (authMethod != AuthenticationMethod.CERTIFICATE)) {
      return false;
    }
    return true;
  }

  /**
   * Returns authentication method used to establish the connection
   * @return AuthenticationMethod used to establish connection
   * @throws IOException
   */
  private AuthenticationMethod getConnectionAuthenticationMethod()
      throws IOException {
    UserGroupInformation ugi = getRemoteUser();
    AuthenticationMethod authMethod = ugi.getAuthenticationMethod();
    if (authMethod == AuthenticationMethod.PROXY) {
      // report the real (proxied-for) user's method, not PROXY itself
      authMethod = ugi.getRealUser().getAuthenticationMethod();
    }
    return authMethod;
  }

  /**
   * Client invoked methods are invoked over RPC and will be in
   * RPC call context even if the client exits.
   */
  boolean isExternalInvocation() {
    return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation();
  }

  private static InetAddress getRemoteIp() {
    InetAddress ip = Server.getRemoteIp();
    if (ip != null) {
      return ip;
    }
    // not an RPC call; fall back to the WebHDFS request's address
    return NamenodeWebHdfsMethods.getRemoteIp();
  }

  // optimize ugi lookup for RPC operations to avoid a trip through
  // UGI.getCurrentUser which is synch'ed
  private static UserGroupInformation getRemoteUser() throws IOException {
    return NameNode.getRemoteUser();
  }

  /**
   * Log fsck event in the audit log
   */
  void logFsckEvent(String src, InetAddress remoteAddress) throws IOException {
    if (isAuditEnabled()) {
      logAuditEvent(true, getRemoteUser(),
                    remoteAddress,
                    "fsck", src, null, null);
    }
  }
  /**
   * Register NameNodeMXBean
   */
  private void registerMXBean() {
    mxbeanName = MBeans.register("NameNode", "NameNodeInfo", this);
  }

  /**
   * Class representing Namenode information for JMX interfaces
   */
  @Override // NameNodeMXBean
  public String getVersion() {
    return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision();
  }

  @Override // NameNodeMXBean
  public long getUsed() {
    return this.getCapacityUsed();
  }

  @Override // NameNodeMXBean
  public long getFree() {
    return this.getCapacityRemaining();
  }

  @Override // NameNodeMXBean
  public long getTotal() {
    return this.getCapacityTotal();
  }

  @Override // NameNodeMXBean
  public String getSafemode() {
    if (!this.isInSafeMode())
      return "";
    return "Safe mode is ON. " + this.getSafeModeTip();
  }

  @Override // NameNodeMXBean
  public boolean isUpgradeFinalized() {
    return this.getFSImage().isUpgradeFinalized();
  }

  @Override // NameNodeMXBean
  public long getNonDfsUsedSpace() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }

  @Override // NameNodeMXBean
  public float getPercentUsed() {
    return datanodeStatistics.getCapacityUsedPercent();
  }

  @Override // NameNodeMXBean
  public long getBlockPoolUsedSpace() {
    return datanodeStatistics.getBlockPoolUsed();
  }

  @Override // NameNodeMXBean
  public float getPercentBlockPoolUsed() {
    return datanodeStatistics.getPercentBlockPoolUsed();
  }

  @Override // NameNodeMXBean
  public float getPercentRemaining() {
    return datanodeStatistics.getCapacityRemainingPercent();
  }

  @Override // NameNodeMXBean
  public long getCacheCapacity() {
    return datanodeStatistics.getCacheCapacity();
  }

  @Override // NameNodeMXBean
  public long getCacheUsed() {
    return datanodeStatistics.getCacheUsed();
  }

  @Override // NameNodeMXBean
  public long getTotalBlocks() {
    return getBlocksTotal();
  }

  @Override // NameNodeMXBean
  @Metric
  public long getTotalFiles() {
    return getFilesTotal();
  }

  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocks() {
    return getMissingBlocksCount();
  }

  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocksWithReplicationFactorOne() {
    return getMissingReplOneBlocksCount();
  }

  @Override // NameNodeMXBean
  public int getThreads() {
    return ManagementFactory.getThreadMXBean().getThreadCount();
  }

  /**
   * Returned information is a JSON representation of map with host name as the
   * key and value is a map of live node attribute keys to its values
   */
  @Override // NameNodeMXBean
  public String getLiveNodes() {
    final Map<String, Map<String,Object>> info =
      new HashMap<String, Map<String,Object>>();
    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);
    for (DatanodeDescriptor node : live) {
      ImmutableMap.Builder<String, Object> innerinfo =
          ImmutableMap.<String,Object>builder();
      innerinfo
          .put("infoAddr", node.getInfoAddr())
          .put("infoSecureAddr", node.getInfoSecureAddr())
          .put("xferaddr", node.getXferAddr())
          .put("lastContact", getLastContact(node))
          .put("usedSpace", getDfsUsed(node))
          .put("adminState", node.getAdminState().toString())
          .put("nonDfsUsedSpace", node.getNonDfsUsed())
          .put("capacity", node.getCapacity())
          .put("numBlocks", node.numBlocks())
          .put("version", node.getSoftwareVersion())
          .put("used", node.getDfsUsed())
          .put("remaining", node.getRemaining())
          .put("blockScheduled", node.getBlocksScheduled())
          .put("blockPoolUsed", node.getBlockPoolUsed())
          .put("blockPoolUsedPercent", node.getBlockPoolUsedPercent())
          .put("volfails", node.getVolumeFailures());
      // volume failure details are only present when failures occurred
      VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
      if (volumeFailureSummary != null) {
        innerinfo
            .put("failedStorageLocations",
                volumeFailureSummary.getFailedStorageLocations())
            .put("lastVolumeFailureDate",
                volumeFailureSummary.getLastVolumeFailureDate())
            .put("estimatedCapacityLostTotal",
                volumeFailureSummary.getEstimatedCapacityLostTotal());
      }
      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo.build());
    }
    return JSON.toString(info);
  }

  /**
   * Returned information is a JSON representation of map with host name as the
   * key and value is a map of dead node attribute keys to its values
   */
  @Override // NameNodeMXBean
  public String getDeadNodes() {
    final Map<String, Map<String, Object>> info =
      new HashMap<String, Map<String, Object>>();
    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
    blockManager.getDatanodeManager().fetchDatanodes(null, dead, true);
    for (DatanodeDescriptor node : dead) {
      Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
          .put("lastContact", getLastContact(node))
          .put("decommissioned", node.isDecommissioned())
          .put("xferaddr", node.getXferAddr())
          .build();
      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
    }
    return JSON.toString(info);
  }

  /**
   * Returned information is a JSON representation of map with host name as the
   * key and value is a map of decommissioning node attribute keys to its
   * values
   */
  @Override // NameNodeMXBean
  public String getDecomNodes() {
    final Map<String, Map<String, Object>> info =
      new HashMap<String, Map<String, Object>>();
    final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager(
        ).getDecommissioningNodes();
    for (DatanodeDescriptor node : decomNodeList) {
      Map<String, Object> innerinfo = ImmutableMap
          .<String, Object> builder()
          .put("xferaddr", node.getXferAddr())
          .put("underReplicatedBlocks",
              node.decommissioningStatus.getUnderReplicatedBlocks())
          .put("decommissionOnlyReplicas",
              node.decommissioningStatus.getDecommissionOnlyReplicas())
          .put("underReplicateInOpenFiles",
              node.decommissioningStatus.getUnderReplicatedInOpenFiles())
          .build();
      info.put(node.getHostName() + ":" + node.getXferPort(), innerinfo);
    }
    return JSON.toString(info);
  }

  // seconds elapsed since the datanode's last heartbeat, measured on the
  // monotonic clock
  private long getLastContact(DatanodeDescriptor alivenode) {
    return (monotonicNow() - alivenode.getLastUpdateMonotonic())/1000;
  }

  private long getDfsUsed(DatanodeDescriptor
      alivenode) {
    return alivenode.getDfsUsed();
  }

  @Override // NameNodeMXBean
  public String getClusterId() {
    return getFSImage().getStorage().getClusterID();
  }

  @Override // NameNodeMXBean
  public String getBlockPoolId() {
    return blockPoolId;
  }

  @Override // NameNodeMXBean
  public String getNameDirStatuses() {
    Map<String, Map<File, StorageDirType>> statusMap =
      new HashMap<String, Map<File, StorageDirType>>();

    Map<File, StorageDirType> activeDirs = new HashMap<File, StorageDirType>();
    for (Iterator<StorageDirectory> it
        = getFSImage().getStorage().dirIterator(); it.hasNext();) {
      StorageDirectory st = it.next();
      activeDirs.put(st.getRoot(), st.getStorageDirType());
    }
    statusMap.put("active", activeDirs);

    List<Storage.StorageDirectory> removedStorageDirs
        = getFSImage().getStorage().getRemovedStorageDirs();
    Map<File, StorageDirType> failedDirs = new HashMap<File, StorageDirType>();
    for (StorageDirectory st : removedStorageDirs) {
      failedDirs.put(st.getRoot(), st.getStorageDirType());
    }
    statusMap.put("failed", failedDirs);

    return JSON.toString(statusMap);
  }

  @Override // NameNodeMXBean
  public String getNodeUsage() {
    float median = 0;
    float max = 0;
    float min = 0;
    float dev = 0;

    final Map<String, Map<String,Object>> info =
        new HashMap<String, Map<String,Object>>();
    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);

    if (live.size() > 0) {
      float totalDfsUsed = 0;
      float[] usages = new float[live.size()];
      int i = 0;
      for (DatanodeDescriptor dn : live) {
        usages[i++] = dn.getDfsUsedPercent();
        totalDfsUsed += dn.getDfsUsedPercent();
      }
      // totalDfsUsed now holds the mean usage percentage across live nodes
      totalDfsUsed /= live.size();
      Arrays.sort(usages);
      median = usages[usages.length / 2];
      max = usages[usages.length - 1];
      min = usages[0];

      for (i = 0; i < usages.length; i++) {
        dev += (usages[i] - totalDfsUsed) * (usages[i] - totalDfsUsed);
      }
      dev = (float) Math.sqrt(dev / usages.length);
    }

    final Map<String, Object> innerInfo = new HashMap<String, Object>();
    innerInfo.put("min", StringUtils.format("%.2f%%", min));
    innerInfo.put("median", StringUtils.format("%.2f%%", median));
    innerInfo.put("max", StringUtils.format("%.2f%%", max));
    innerInfo.put("stdDev", StringUtils.format("%.2f%%", dev));
    info.put("nodeUsage", innerInfo);

    return JSON.toString(info);
  }

  @Override // NameNodeMXBean
  public String getNameJournalStatus() {
    List<Map<String, String>> jasList = new ArrayList<Map<String, String>>();
    FSEditLog log = getFSImage().getEditLog();
    if (log != null) {
      boolean openForWrite = log.isOpenForWrite();
      for (JournalAndStream jas : log.getJournals()) {
        final Map<String, String> jasMap = new HashMap<String, String>();
        String manager = jas.getManager().toString();

        jasMap.put("required", String.valueOf(jas.isRequired()));
        jasMap.put("disabled", String.valueOf(jas.isDisabled()));
        jasMap.put("manager", manager);

        if (jas.isDisabled()) {
          jasMap.put("stream", "Failed");
        } else if (openForWrite) {
          EditLogOutputStream elos = jas.getCurrentStream();
          if (elos != null) {
            jasMap.put("stream", elos.generateReport());
          } else {
            jasMap.put("stream", "not currently writing");
          }
        } else {
          jasMap.put("stream", "open for read");
        }
        jasList.add(jasMap);
      }
    }
    return JSON.toString(jasList);
  }

  @Override // NameNodeMxBean
  public String getJournalTransactionInfo() {
    Map<String, String> txnIdMap = new HashMap<String, String>();
    txnIdMap.put("LastAppliedOrWrittenTxId",
Long.toString(this.getFSImage().getLastAppliedOrWrittenTxId())); 7166 txnIdMap.put("MostRecentCheckpointTxId", 7167 Long.toString(this.getFSImage().getMostRecentCheckpointTxId())); 7168 return JSON.toString(txnIdMap); 7169 } 7170 7171 @Override // NameNodeMXBean 7172 public String getNNStarted() { 7173 return getStartTime().toString(); 7174 } 7175 7176 @Override // NameNodeMXBean 7177 public String getCompileInfo() { 7178 return VersionInfo.getDate() + " by " + VersionInfo.getUser() + 7179 " from " + VersionInfo.getBranch(); 7180 } 7181 7182 /** @return the block manager. */ 7183 public BlockManager getBlockManager() { 7184 return blockManager; 7185 } 7186 7187 public BlockIdManager getBlockIdManager() { 7188 return blockIdManager; 7189 } 7190 7191 /** @return the FSDirectory. */ 7192 public FSDirectory getFSDirectory() { 7193 return dir; 7194 } 7195 /** Set the FSDirectory. */ 7196 @VisibleForTesting 7197 public void setFSDirectory(FSDirectory dir) { 7198 this.dir = dir; 7199 } 7200 /** @return the cache manager. 
 */
public CacheManager getCacheManager() {
  return cacheManager;
}

@Override // NameNodeMXBean
public String getCorruptFiles() {
  List<String> list = new ArrayList<String>();
  Collection<FSNamesystem.CorruptFileBlockInfo> corruptFileBlocks;
  try {
    corruptFileBlocks = listCorruptFileBlocks("/", null);
    int corruptFileCount = corruptFileBlocks.size();
    if (corruptFileCount != 0) {
      for (FSNamesystem.CorruptFileBlockInfo c : corruptFileBlocks) {
        list.add(c.toString());
      }
    }
  } catch (IOException e) {
    // Best-effort MXBean view: on failure log and return whatever was
    // collected (possibly an empty list) instead of propagating.
    LOG.warn("Get corrupt file blocks returned error: " + e.getMessage());
  }
  return JSON.toString(list);
}

@Override //NameNodeMXBean
public int getDistinctVersionCount() {
  return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
      .size();
}

@Override //NameNodeMXBean
public Map<String, Integer> getDistinctVersions() {
  return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
}

@Override //NameNodeMXBean
public String getSoftwareVersion() {
  return VersionInfo.getVersion();
}

/**
 * Verifies that the given identifier and password are valid and match.
 * @param identifier Token identifier.
 * @param password Password in the token.
 */
public synchronized void verifyToken(DelegationTokenIdentifier identifier,
    byte[] password) throws InvalidToken, RetriableException {
  try {
    getDelegationTokenSecretManager().verifyToken(identifier, password);
  } catch (InvalidToken it) {
    // During a transition to active the secret manager may not yet have
    // replayed all token operations, so tell the client to retry rather
    // than failing the token outright.
    if (inTransitionToActive()) {
      throw new RetriableException(it);
    }
    throw it;
  }
}

@Override
public boolean isGenStampInFuture(Block block) {
  return blockIdManager.isGenStampInFuture(block);
}

@VisibleForTesting
public EditLogTailer getEditLogTailer() {
  return editLogTailer;
}

@VisibleForTesting
public void setEditLogTailerForTests(EditLogTailer tailer) {
  this.editLogTailer = tailer;
}

@VisibleForTesting
void setFsLockForTests(ReentrantReadWriteLock lock) {
  this.fsLock.coarseLock = lock;
}

@VisibleForTesting
public ReentrantReadWriteLock getFsLockForTests() {
  return fsLock.coarseLock;
}

@VisibleForTesting
public ReentrantLock getCpLockForTests() {
  return cpLock;
}

@VisibleForTesting
public SafeModeInfo getSafeModeInfoForTests() {
  return safeMode;
}

@VisibleForTesting
public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
  this.nnResourceChecker = nnResourceChecker;
}

/** @return the snapshot manager. */
public SnapshotManager getSnapshotManager() {
  return snapshotManager;
}

/** Allow snapshot on a directory.
*/ 7301 void allowSnapshot(String path) throws IOException { 7302 checkOperation(OperationCategory.WRITE); 7303 boolean success = false; 7304 writeLock(); 7305 try { 7306 checkOperation(OperationCategory.WRITE); 7307 checkNameNodeSafeMode("Cannot allow snapshot for " + path); 7308 checkSuperuserPrivilege(); 7309 FSDirSnapshotOp.allowSnapshot(dir, snapshotManager, path); 7310 success = true; 7311 } finally { 7312 writeUnlock(); 7313 } 7314 getEditLog().logSync(); 7315 logAuditEvent(success, "allowSnapshot", path, null, null); 7316 } 7317 7318 /** Disallow snapshot on a directory. */ 7319 void disallowSnapshot(String path) throws IOException { 7320 checkOperation(OperationCategory.WRITE); 7321 boolean success = false; 7322 writeLock(); 7323 try { 7324 checkOperation(OperationCategory.WRITE); 7325 checkNameNodeSafeMode("Cannot disallow snapshot for " + path); 7326 checkSuperuserPrivilege(); 7327 FSDirSnapshotOp.disallowSnapshot(dir, snapshotManager, path); 7328 success = true; 7329 } finally { 7330 writeUnlock(); 7331 } 7332 getEditLog().logSync(); 7333 logAuditEvent(success, "disallowSnapshot", path, null, null); 7334 } 7335 7336 /** 7337 * Create a snapshot 7338 * @param snapshotRoot The directory path where the snapshot is taken 7339 * @param snapshotName The name of the snapshot 7340 */ 7341 String createSnapshot(String snapshotRoot, String snapshotName, 7342 boolean logRetryCache) throws IOException { 7343 String snapshotPath = null; 7344 writeLock(); 7345 try { 7346 checkOperation(OperationCategory.WRITE); 7347 checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot); 7348 snapshotPath = FSDirSnapshotOp.createSnapshot(dir, 7349 snapshotManager, snapshotRoot, snapshotName, logRetryCache); 7350 } finally { 7351 writeUnlock(); 7352 } 7353 getEditLog().logSync(); 7354 logAuditEvent(snapshotPath != null, "createSnapshot", snapshotRoot, 7355 snapshotPath, null); 7356 return snapshotPath; 7357 } 7358 7359 /** 7360 * Rename a snapshot 7361 * @param path The 
 directory path where the snapshot was taken
 * @param snapshotOldName Old snapshot name
 * @param snapshotNewName New snapshot name
 * @throws SafeModeException
 * @throws IOException
 */
void renameSnapshot(
    String path, String snapshotOldName, String snapshotNewName,
    boolean logRetryCache) throws IOException {
  checkOperation(OperationCategory.WRITE);
  boolean success = false;
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot rename snapshot for " + path);
    FSDirSnapshotOp.renameSnapshot(dir, snapshotManager, path,
        snapshotOldName, snapshotNewName, logRetryCache);
    success = true;
  } finally {
    writeUnlock();
  }
  getEditLog().logSync();
  // Audit both the old and new snapshot paths (as src/dst).
  String oldSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotOldName);
  String newSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotNewName);
  logAuditEvent(success, "renameSnapshot", oldSnapshotRoot,
      newSnapshotRoot, null);
}

/**
 * Get the list of snapshottable directories that are owned
 * by the current user. Return all the snapshottable directories if the
 * current user is a super user.
 * @return The list of all the current snapshottable directories
 * @throws IOException
 */
public SnapshottableDirectoryStatus[] getSnapshottableDirListing()
    throws IOException {
  SnapshottableDirectoryStatus[] status = null;
  checkOperation(OperationCategory.READ);
  boolean success = false;
  readLock();
  try {
    checkOperation(OperationCategory.READ);
    status = FSDirSnapshotOp.getSnapshottableDirListing(dir, snapshotManager);
    success = true;
  } finally {
    readUnlock();
  }
  logAuditEvent(success, "listSnapshottableDirectory", null, null, null);
  return status;
}

/**
 * Get the difference between two snapshots (or between a snapshot and the
 * current status) of a snapshottable directory.
 *
 * @param path The full path of the snapshottable directory.
 * @param fromSnapshot Name of the snapshot to calculate the diff from. Null
 *          or empty string indicates the current tree.
 * @param toSnapshot Name of the snapshot to calculate the diff to. Null or
 *          empty string indicates the current tree.
 * @return A report about the difference between {@code fromSnapshot} and
 *         {@code toSnapshot}. Modified/deleted/created/renamed files and
 *         directories belonging to the snapshottable directories are listed
 *         and labeled as M/-/+/R respectively.
 * @throws IOException
 */
SnapshotDiffReport getSnapshotDiffReport(String path,
    String fromSnapshot, String toSnapshot) throws IOException {
  SnapshotDiffReport diffs = null;
  checkOperation(OperationCategory.READ);
  readLock();
  try {
    checkOperation(OperationCategory.READ);
    diffs = FSDirSnapshotOp.getSnapshotDiffReport(dir, snapshotManager,
        path, fromSnapshot, toSnapshot);
  } finally {
    readUnlock();
  }

  logAuditEvent(diffs != null, "computeSnapshotDiff", null, null, null);
  return diffs;
}

/**
 * Delete a snapshot of a snapshottable directory
 * @param snapshotRoot The snapshottable directory
 * @param snapshotName The name of the to-be-deleted snapshot
 * @throws SafeModeException
 * @throws IOException
 */
void deleteSnapshot(String snapshotRoot, String snapshotName,
    boolean logRetryCache) throws IOException {
  checkOperation(OperationCategory.WRITE);
  boolean success = false;
  writeLock();
  BlocksMapUpdateInfo blocksToBeDeleted = null;
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot delete snapshot for " + snapshotRoot);

    blocksToBeDeleted = FSDirSnapshotOp.deleteSnapshot(dir, snapshotManager,
        snapshotRoot, snapshotName, logRetryCache);
    success = true;
  } finally {
    writeUnlock();
  }
  getEditLog().logSync();

  // Breaking the pattern as removing blocks have to happen outside of the
  // global lock
  if (blocksToBeDeleted != null) {
    removeBlocks(blocksToBeDeleted);
  }

  String rootPath = Snapshot.getSnapshotPath(snapshotRoot, snapshotName);
  logAuditEvent(success, "deleteSnapshot", rootPath, null, null);
}

/**
 * Remove a list of INodeDirectorySnapshottable from the SnapshotManager
 * @param toRemove the list of INodeDirectorySnapshottable to be removed
 */
void
removeSnapshottableDirs(List<INodeDirectory> toRemove) {
  if (snapshotManager != null) {
    snapshotManager.removeSnapshottable(toRemove);
  }
}

/** @return the current rolling upgrade info, refreshing the rollback-image
 *  flag first; null if no rolling upgrade is in progress. */
RollingUpgradeInfo queryRollingUpgrade() throws IOException {
  checkSuperuserPrivilege();
  checkOperation(OperationCategory.READ);
  readLock();
  try {
    if (rollingUpgradeInfo != null) {
      boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
      rollingUpgradeInfo.setCreatedRollbackImages(hasRollbackImage);
    }
    return rollingUpgradeInfo;
  } finally {
    readUnlock();
  }
}

/** Start a rolling upgrade; idempotent if one is already in progress. */
RollingUpgradeInfo startRollingUpgrade() throws IOException {
  checkSuperuserPrivilege();
  checkOperation(OperationCategory.WRITE);
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    if (isRollingUpgrade()) {
      return rollingUpgradeInfo;
    }
    long startTime = now();
    if (!haEnabled) { // for non-HA, we require NN to be in safemode
      startRollingUpgradeInternalForNonHA(startTime);
    } else { // for HA, NN cannot be in safemode
      checkNameNodeSafeMode("Failed to start rolling upgrade");
      startRollingUpgradeInternal(startTime);
    }

    getEditLog().logStartRollingUpgrade(rollingUpgradeInfo.getStartTime());
    if (haEnabled) {
      // roll the edit log to make sure the standby NameNode can tail
      getFSImage().rollEditLog();
    }
  } finally {
    writeUnlock();
  }

  getEditLog().logSync();
  if (auditLog.isInfoEnabled() && isExternalInvocation()) {
    logAuditEvent(true, "startRollingUpgrade", null, null, null);
  }
  return rollingUpgradeInfo;
}

/**
 * Update internal state to indicate that a rolling upgrade is in progress.
 * @param startTime rolling upgrade start time
 */
void startRollingUpgradeInternal(long startTime)
    throws IOException {
  checkRollingUpgrade("start rolling upgrade");
  getFSImage().checkUpgrade();
  setRollingUpgradeInfo(false, startTime);
}

/**
 * Update internal state to indicate that a rolling upgrade is in progress for
 * non-HA setup. This requires the namesystem is in SafeMode and after doing a
 * checkpoint for rollback the namesystem will quit the safemode automatically
 */
private void startRollingUpgradeInternalForNonHA(long startTime)
    throws IOException {
  Preconditions.checkState(!haEnabled);
  if (!isInSafeMode()) {
    throw new IOException("Safe mode should be turned ON "
        + "in order to create namespace image.");
  }
  checkRollingUpgrade("start rolling upgrade");
  getFSImage().checkUpgrade();
  // in non-HA setup, we do an extra checkpoint to generate a rollback image
  getFSImage().saveNamespace(this, NameNodeFile.IMAGE_ROLLBACK, null);
  LOG.info("Successfully saved namespace for preparing rolling upgrade.");

  // leave SafeMode automatically
  setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
  setRollingUpgradeInfo(true, startTime);
}

/** Reset the rolling upgrade state; finalize time 0 means "in progress". */
void setRollingUpgradeInfo(boolean createdRollbackImages, long startTime) {
  rollingUpgradeInfo = new RollingUpgradeInfo(blockPoolId,
      createdRollbackImages, startTime, 0L);
}

public void setCreatedRollbackImages(boolean created) {
  if (rollingUpgradeInfo != null) {
    rollingUpgradeInfo.setCreatedRollbackImages(created);
  }
}

public RollingUpgradeInfo getRollingUpgradeInfo() {
  return rollingUpgradeInfo;
}

public boolean isNeedRollbackFsImage() {
  return needRollbackFsImage;
}

public void setNeedRollbackFsImage(boolean needRollbackFsImage) {
  this.needRollbackFsImage = needRollbackFsImage;
}

@Override // NameNodeMXBean
public RollingUpgradeInfo.Bean getRollingUpgradeStatus() {
  if (!isRollingUpgrade()) {
    return null;
  }
  // Fast path: if rollback images are already recorded, no lock needed.
  RollingUpgradeInfo upgradeInfo = getRollingUpgradeInfo();
  if (upgradeInfo.createdRollbackImages()) {
    return new RollingUpgradeInfo.Bean(upgradeInfo);
  }
  readLock();
  try {
    // check again after acquiring the read lock.
    upgradeInfo = getRollingUpgradeInfo();
    if (upgradeInfo == null) {
      return null;
    }
    if (!upgradeInfo.createdRollbackImages()) {
      boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
      upgradeInfo.setCreatedRollbackImages(hasRollbackImage);
    }
  } catch (IOException ioe) {
    LOG.warn("Encountered exception setting Rollback Image", ioe);
  } finally {
    readUnlock();
  }
  return new RollingUpgradeInfo.Bean(upgradeInfo);
}

/** Is rolling upgrade in progress? */
public boolean isRollingUpgrade() {
  return rollingUpgradeInfo != null;
}

/**
 * Reject {@code action} if a rolling upgrade is already in progress.
 * @param action description of the attempted action, used in the message
 */
void checkRollingUpgrade(String action) throws RollingUpgradeException {
  if (isRollingUpgrade()) {
    throw new RollingUpgradeException("Failed to " + action
        + " since a rolling upgrade is already in progress."
        + " Existing rolling upgrade info:\n" + rollingUpgradeInfo);
  }
}

/** Finalize the current rolling upgrade; returns null if none in progress. */
RollingUpgradeInfo finalizeRollingUpgrade() throws IOException {
  checkSuperuserPrivilege();
  checkOperation(OperationCategory.WRITE);
  writeLock();
  final RollingUpgradeInfo returnInfo;
  try {
    checkOperation(OperationCategory.WRITE);
    if (!isRollingUpgrade()) {
      return null;
    }
    checkNameNodeSafeMode("Failed to finalize rolling upgrade");

    returnInfo = finalizeRollingUpgradeInternal(now());
    getEditLog().logFinalizeRollingUpgrade(returnInfo.getFinalizeTime());
    if (haEnabled) {
      // roll the edit log to make sure the standby NameNode can tail
      getFSImage().rollEditLog();
    }
    getFSImage().updateStorageVersion();
    getFSImage().renameCheckpoint(NameNodeFile.IMAGE_ROLLBACK,
        NameNodeFile.IMAGE);
  } finally {
    writeUnlock();
  }

  if (!haEnabled) {
    // Sync not needed for ha since the edit was rolled after logging.
    getEditLog().logSync();
  }

  if (auditLog.isInfoEnabled() && isExternalInvocation()) {
    logAuditEvent(true, "finalizeRollingUpgrade", null, null, null);
  }
  return returnInfo;
}

/** Clear the rolling-upgrade state and build the finalized info bean. */
RollingUpgradeInfo finalizeRollingUpgradeInternal(long finalizeTime)
    throws RollingUpgradeException {
  final long startTime = rollingUpgradeInfo.getStartTime();
  rollingUpgradeInfo = null;
  return new RollingUpgradeInfo(blockPoolId, false, startTime, finalizeTime);
}

/**
 * Add a new cache directive.
 * @return the id of the new directive, or 0 on failure
 */
long addCacheDirective(CacheDirectiveInfo directive,
    EnumSet<CacheFlag> flags, boolean logRetryCache)
    throws IOException {
  checkOperation(OperationCategory.WRITE);
  CacheDirectiveInfo effectiveDirective = null;
  if (!flags.contains(CacheFlag.FORCE)) {
    cacheManager.waitForRescanIfNeeded();
  }
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    if (isInSafeMode()) {
      throw new SafeModeException(
          "Cannot add cache directive", safeMode);
    }
    effectiveDirective = FSNDNCacheOp.addCacheDirective(this, cacheManager,
        directive, flags, logRetryCache);
  } finally {
    writeUnlock();
    // logSync and audit happen in the finally clause, after the unlock, so
    // failures are audited too.
    boolean success = effectiveDirective != null;
    if (success) {
      getEditLog().logSync();
    }

    String effectiveDirectiveStr = effectiveDirective != null ?
        effectiveDirective.toString() : null;
    logAuditEvent(success, "addCacheDirective", effectiveDirectiveStr,
        null, null);
  }
  return effectiveDirective != null ?
effectiveDirective.getId() : 0; 7709 } 7710 7711 void modifyCacheDirective(CacheDirectiveInfo directive, 7712 EnumSet<CacheFlag> flags, boolean logRetryCache) throws IOException { 7713 checkOperation(OperationCategory.WRITE); 7714 boolean success = false; 7715 if (!flags.contains(CacheFlag.FORCE)) { 7716 cacheManager.waitForRescanIfNeeded(); 7717 } 7718 writeLock(); 7719 try { 7720 checkOperation(OperationCategory.WRITE); 7721 if (isInSafeMode()) { 7722 throw new SafeModeException( 7723 "Cannot add cache directive", safeMode); 7724 } 7725 FSNDNCacheOp.modifyCacheDirective(this, cacheManager, directive, flags, 7726 logRetryCache); 7727 success = true; 7728 } finally { 7729 writeUnlock(); 7730 if (success) { 7731 getEditLog().logSync(); 7732 } 7733 String idStr = "{id: " + directive.getId().toString() + "}"; 7734 logAuditEvent(success, "modifyCacheDirective", idStr, 7735 directive.toString(), null); 7736 } 7737 } 7738 7739 void removeCacheDirective(long id, boolean logRetryCache) throws IOException { 7740 checkOperation(OperationCategory.WRITE); 7741 boolean success = false; 7742 writeLock(); 7743 try { 7744 checkOperation(OperationCategory.WRITE); 7745 if (isInSafeMode()) { 7746 throw new SafeModeException( 7747 "Cannot remove cache directives", safeMode); 7748 } 7749 FSNDNCacheOp.removeCacheDirective(this, cacheManager, id, logRetryCache); 7750 success = true; 7751 } finally { 7752 writeUnlock(); 7753 String idStr = "{id: " + Long.toString(id) + "}"; 7754 logAuditEvent(success, "removeCacheDirective", idStr, null, 7755 null); 7756 } 7757 getEditLog().logSync(); 7758 } 7759 7760 BatchedListEntries<CacheDirectiveEntry> listCacheDirectives( 7761 long startId, CacheDirectiveInfo filter) throws IOException { 7762 checkOperation(OperationCategory.READ); 7763 BatchedListEntries<CacheDirectiveEntry> results; 7764 cacheManager.waitForRescanIfNeeded(); 7765 readLock(); 7766 boolean success = false; 7767 try { 7768 checkOperation(OperationCategory.READ); 7769 results = 
FSNDNCacheOp.listCacheDirectives(this, cacheManager, startId, 7770 filter); 7771 success = true; 7772 } finally { 7773 readUnlock(); 7774 logAuditEvent(success, "listCacheDirectives", filter.toString(), null, 7775 null); 7776 } 7777 return results; 7778 } 7779 7780 void addCachePool(CachePoolInfo req, boolean logRetryCache) 7781 throws IOException { 7782 checkOperation(OperationCategory.WRITE); 7783 writeLock(); 7784 boolean success = false; 7785 String poolInfoStr = null; 7786 try { 7787 checkOperation(OperationCategory.WRITE); 7788 if (isInSafeMode()) { 7789 throw new SafeModeException( 7790 "Cannot add cache pool " + req.getPoolName(), safeMode); 7791 } 7792 CachePoolInfo info = FSNDNCacheOp.addCachePool(this, cacheManager, req, 7793 logRetryCache); 7794 poolInfoStr = info.toString(); 7795 success = true; 7796 } finally { 7797 writeUnlock(); 7798 logAuditEvent(success, "addCachePool", poolInfoStr, null, null); 7799 } 7800 7801 getEditLog().logSync(); 7802 } 7803 7804 void modifyCachePool(CachePoolInfo req, boolean logRetryCache) 7805 throws IOException { 7806 checkOperation(OperationCategory.WRITE); 7807 writeLock(); 7808 boolean success = false; 7809 try { 7810 checkOperation(OperationCategory.WRITE); 7811 if (isInSafeMode()) { 7812 throw new SafeModeException( 7813 "Cannot modify cache pool " + req.getPoolName(), safeMode); 7814 } 7815 FSNDNCacheOp.modifyCachePool(this, cacheManager, req, logRetryCache); 7816 success = true; 7817 } finally { 7818 writeUnlock(); 7819 String poolNameStr = "{poolName: " + 7820 (req == null ? null : req.getPoolName()) + "}"; 7821 logAuditEvent(success, "modifyCachePool", poolNameStr, 7822 req == null ? 
null : req.toString(), null);
  }

  getEditLog().logSync();
}

/** Remove a cache pool and all directives in it. */
void removeCachePool(String cachePoolName, boolean logRetryCache)
    throws IOException {
  checkOperation(OperationCategory.WRITE);
  writeLock();
  boolean success = false;
  try {
    checkOperation(OperationCategory.WRITE);
    if (isInSafeMode()) {
      throw new SafeModeException(
          "Cannot remove cache pool " + cachePoolName, safeMode);
    }
    FSNDNCacheOp.removeCachePool(this, cacheManager, cachePoolName,
        logRetryCache);
    success = true;
  } finally {
    writeUnlock();
    String poolNameStr = "{poolName: " + cachePoolName + "}";
    logAuditEvent(success, "removeCachePool", poolNameStr, null, null);
  }

  getEditLog().logSync();
}

/** List cache pools, starting after {@code prevKey}. */
BatchedListEntries<CachePoolEntry> listCachePools(String prevKey)
    throws IOException {
  BatchedListEntries<CachePoolEntry> results;
  checkOperation(OperationCategory.READ);
  boolean success = false;
  cacheManager.waitForRescanIfNeeded();
  readLock();
  try {
    checkOperation(OperationCategory.READ);
    results = FSNDNCacheOp.listCachePools(this, cacheManager, prevKey);
    success = true;
  } finally {
    readUnlock();
    logAuditEvent(success, "listCachePools", null, null, null);
  }
  return results;
}

/** Add the given ACL entries to the existing ACL of {@code src}. */
void modifyAclEntries(final String src, List<AclEntry> aclSpec)
    throws IOException {
  HdfsFileStatus auditStat = null;
  checkOperation(OperationCategory.WRITE);
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot modify ACL entries on " + src);
    auditStat = FSDirAclOp.modifyAclEntries(dir, src, aclSpec);
  } catch (AccessControlException e) {
    logAuditEvent(false, "modifyAclEntries", src);
    throw e;
  } finally {
    writeUnlock();
  }
  getEditLog().logSync();
  logAuditEvent(true, "modifyAclEntries", src, null, auditStat);
}

/** Remove the given ACL entries from the ACL of {@code src}. */
void removeAclEntries(final String src, List<AclEntry> aclSpec)
    throws IOException {
  checkOperation(OperationCategory.WRITE);
  HdfsFileStatus auditStat = null;
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot remove ACL entries on " + src);
    auditStat = FSDirAclOp.removeAclEntries(dir, src, aclSpec);
  } catch (AccessControlException e) {
    logAuditEvent(false, "removeAclEntries", src);
    throw e;
  } finally {
    writeUnlock();
  }
  getEditLog().logSync();
  logAuditEvent(true, "removeAclEntries", src, null, auditStat);
}

/** Remove only the default ACL entries of {@code src}. */
void removeDefaultAcl(final String src) throws IOException {
  HdfsFileStatus auditStat = null;
  checkOperation(OperationCategory.WRITE);
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot remove default ACL entries on " + src);
    auditStat = FSDirAclOp.removeDefaultAcl(dir, src);
  } catch (AccessControlException e) {
    logAuditEvent(false, "removeDefaultAcl", src);
    throw e;
  } finally {
    writeUnlock();
  }
  getEditLog().logSync();
  logAuditEvent(true, "removeDefaultAcl", src, null, auditStat);
}

/** Remove the entire ACL of {@code src}, leaving only permission bits. */
void removeAcl(final String src) throws IOException {
  HdfsFileStatus auditStat = null;
  checkOperation(OperationCategory.WRITE);
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot remove ACL on " + src);
    auditStat = FSDirAclOp.removeAcl(dir, src);
  } catch (AccessControlException e) {
    logAuditEvent(false, "removeAcl", src);
    throw e;
  } finally {
    writeUnlock();
  }
  getEditLog().logSync();
  logAuditEvent(true, "removeAcl", src, null, auditStat);
}

/** Fully replace the ACL of {@code src} with {@code aclSpec}. */
void setAcl(final String src, List<AclEntry> aclSpec) throws IOException {
  HdfsFileStatus auditStat = null;
checkOperation(OperationCategory.WRITE);
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot set ACL on " + src);
    auditStat = FSDirAclOp.setAcl(dir, src, aclSpec);
  } catch (AccessControlException e) {
    logAuditEvent(false, "setAcl", src);
    throw e;
  } finally {
    writeUnlock();
  }
  getEditLog().logSync();
  logAuditEvent(true, "setAcl", src, null, auditStat);
}

/** @return the ACL status of {@code src}. */
AclStatus getAclStatus(String src) throws IOException {
  checkOperation(OperationCategory.READ);
  boolean success = false;
  readLock();
  try {
    checkOperation(OperationCategory.READ);
    final AclStatus ret = FSDirAclOp.getAclStatus(dir, src);
    success = true;
    return ret;
  } finally {
    readUnlock();
    logAuditEvent(success, "getAclStatus", src);
  }
}

/**
 * Create an encryption zone on directory src using the specified key.
 *
 * @param src the path of a directory which will be the root of the
 * encryption zone. The directory must be empty.
 * @param keyName name of a key which must be present in the configured
 * KeyProvider.
 * @throws AccessControlException if the caller is not the superuser.
 * @throws UnresolvedLinkException if the path can't be resolved.
 * @throws SafeModeException if the Namenode is in safe mode.
 */
void createEncryptionZone(final String src, final String keyName,
    boolean logRetryCache)
    throws IOException, UnresolvedLinkException,
        SafeModeException, AccessControlException {
  try {
    if (provider == null) {
      throw new IOException(
          "Can't create an encryption zone for " + src +
          " since no key provider is available.");
    }
    if (keyName == null || keyName.isEmpty()) {
      throw new IOException("Must specify a key name when creating an " +
          "encryption zone");
    }
    KeyProvider.Metadata metadata = provider.getMetadata(keyName);
    if (metadata == null) {
      /*
       * It would be nice if we threw something more specific than
       * IOException when the key is not found, but the KeyProvider API
       * doesn't provide for that. If that API is ever changed to throw
       * something more specific (e.g. UnknownKeyException) then we can
       * update this to match it, or better yet, just rethrow the
       * KeyProvider's exception.
       */
      throw new IOException("Key " + keyName + " doesn't exist.");
    }
    // If the provider supports pool for EDEKs, this will fill in the pool
    generateEncryptedDataEncryptionKey(keyName);
    createEncryptionZoneInt(src, metadata.getCipher(),
        keyName, logRetryCache);
  } catch (AccessControlException e) {
    logAuditEvent(false, "createEncryptionZone", src);
    throw e;
  }
}

/**
 * Does the actual EZ creation under the write lock: resolves the path,
 * records the EZ xattr, and logs it to the edit log.
 */
private void createEncryptionZoneInt(final String srcArg, String cipher,
    String keyName, final boolean logRetryCache) throws IOException {
  String src = srcArg;
  HdfsFileStatus resultingStat = null;
  checkSuperuserPrivilege();
  checkOperation(OperationCategory.WRITE);
  final byte[][] pathComponents =
      FSDirectory.getPathComponentsForReservedPath(src);
  FSPermissionChecker pc = getPermissionChecker();
  writeLock();
  try {
    // Re-check privilege and HA state under the lock.
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot create encryption zone on " + src);
    src = dir.resolvePath(pc, src, pathComponents);

    final CipherSuite suite = CipherSuite.convert(cipher);
    // For now this is hardcoded, as we only support one method.
    final CryptoProtocolVersion version =
        CryptoProtocolVersion.ENCRYPTION_ZONES;
    final XAttr ezXAttr = dir.createEncryptionZone(src, suite,
        version, keyName);
    List<XAttr> xAttrs = Lists.newArrayListWithCapacity(1);
    xAttrs.add(ezXAttr);
    getEditLog().logSetXAttrs(src, xAttrs, logRetryCache);
    final INodesInPath iip = dir.getINodesInPath4Write(src, false);
    resultingStat = dir.getAuditFileInfo(iip);
  } finally {
    writeUnlock();
  }
  getEditLog().logSync();
  logAuditEvent(true, "createEncryptionZone", srcArg, null, resultingStat);
}

/**
 * Get the encryption zone for the specified path.
 *
 * @param srcArg the path of a file or directory to get the EZ for.
 * @return the EZ of the path or null if none.
 * @throws AccessControlException if the caller is not the superuser.
 * @throws UnresolvedLinkException if the path can't be resolved.
 */
EncryptionZone getEZForPath(final String srcArg)
    throws AccessControlException, UnresolvedLinkException, IOException {
  String src = srcArg;
  HdfsFileStatus resultingStat = null;
  final byte[][] pathComponents =
      FSDirectory.getPathComponentsForReservedPath(src);
  boolean success = false;
  final FSPermissionChecker pc = getPermissionChecker();
  checkOperation(OperationCategory.READ);
  readLock();
  try {
    checkOperation(OperationCategory.READ);
    src = dir.resolvePath(pc, src, pathComponents);
    final INodesInPath iip = dir.getINodesInPath(src, true);
    if (isPermissionEnabled) {
      dir.checkPathAccess(pc, iip, FsAction.READ);
    }
    final EncryptionZone ret = dir.getEZForPath(iip);
    resultingStat = dir.getAuditFileInfo(iip);
    success = true;
    return ret;
  } finally {
    readUnlock();
    logAuditEvent(success, "getEZForPath", srcArg, null, resultingStat);
  }
}

/** List encryption zones, starting after {@code prevId}; superuser only. */
BatchedListEntries<EncryptionZone> listEncryptionZones(long prevId)
    throws IOException {
  boolean success = false;
  checkSuperuserPrivilege();
  checkOperation(OperationCategory.READ);
  readLock();
  try {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.READ);
    final BatchedListEntries<EncryptionZone> ret =
        dir.listEncryptionZones(prevId);
    success = true;
    return ret;
  } finally {
    readUnlock();
    logAuditEvent(success, "listEncryptionZones", null);
  }
}

/** Set an extended attribute on {@code src}, subject to {@code flag}. */
void setXAttr(String src, XAttr xAttr, EnumSet<XAttrSetFlag> flag,
    boolean logRetryCache)
    throws IOException {
  checkOperation(OperationCategory.WRITE);
  HdfsFileStatus auditStat = null;
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot set XAttr on " + src);
    auditStat = FSDirXAttrOp.setXAttr(dir, src, xAttr, flag, logRetryCache);
  } catch (AccessControlException e) {
    logAuditEvent(false, "setXAttr", src);
    throw e;
  } finally {
    writeUnlock();
  }
  getEditLog().logSync();
  logAuditEvent(true, "setXAttr", src, null, auditStat);
}

/** Get the values of the requested extended attributes of {@code src}. */
List<XAttr> getXAttrs(final String src, List<XAttr> xAttrs)
    throws IOException {
  checkOperation(OperationCategory.READ);
  readLock();
  try {
    checkOperation(OperationCategory.READ);
    return FSDirXAttrOp.getXAttrs(dir, src, xAttrs);
  } catch (AccessControlException e) {
    logAuditEvent(false, "getXAttrs", src);
    throw e;
  } finally {
    readUnlock();
  }
}

/** List the names of all extended attributes of {@code src}. */
List<XAttr> listXAttrs(String src) throws IOException {
  checkOperation(OperationCategory.READ);
  readLock();
  try {
    checkOperation(OperationCategory.READ);
    return FSDirXAttrOp.listXAttrs(dir, src);
  } catch (AccessControlException e) {
    logAuditEvent(false, "listXAttrs", src);
    throw e;
  } finally {
    readUnlock();
  }
}

/** Remove an extended attribute from {@code src}. */
void removeXAttr(String src, XAttr xAttr, boolean logRetryCache)
    throws IOException {
  checkOperation(OperationCategory.WRITE);
  HdfsFileStatus auditStat = null;
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot remove XAttr entry on " + src);
    auditStat = FSDirXAttrOp.removeXAttr(dir, src, xAttr, logRetryCache);
  } catch (AccessControlException e) {
    logAuditEvent(false, "removeXAttr", src);
    throw e;
  } finally {
    writeUnlock();
  }
  getEditLog().logSync();
  logAuditEvent(true, "removeXAttr", src, null, auditStat);
}

/** Check whether the current user can access {@code src} with {@code mode}. */
void checkAccess(String src, FsAction mode) throws IOException {
  checkOperation(OperationCategory.READ);
byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
  readLock();
  try {
    checkOperation(OperationCategory.READ);
    src = FSDirectory.resolvePath(src, pathComponents, dir);
    final INodesInPath iip = dir.getINodesInPath(src, true);
    INode inode = iip.getLastINode();
    if (inode == null) {
      throw new FileNotFoundException("Path not found");
    }
    if (isPermissionEnabled) {
      FSPermissionChecker pc = getPermissionChecker();
      dir.checkPathAccess(pc, iip, mode);
    }
  } catch (AccessControlException e) {
    logAuditEvent(false, "checkAccess", src);
    throw e;
  } finally {
    readUnlock();
  }
}

/**
 * Default AuditLogger implementation; used when no access logger is
 * defined in the config file. It can also be explicitly listed in the
 * config file.
 */
private static class DefaultAuditLogger extends HdfsAuditLogger {

  // Whether to append the delegation-token tracking id to each audit line.
  private boolean logTokenTrackingId;

  @Override
  public void initialize(Configuration conf) {
    logTokenTrackingId = conf.getBoolean(
        DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
        DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT);
  }

  @Override
  public void logAuditEvent(boolean succeeded, String userName,
      InetAddress addr, String cmd, String src, String dst,
      FileStatus status, UserGroupInformation ugi,
      DelegationTokenSecretManager dtSecretManager) {
    if (auditLog.isInfoEnabled()) {
      // Reuse a thread-local StringBuilder to avoid per-event allocation.
      // NOTE: the tab-separated key=value format below is parsed by
      // downstream tooling; do not change it.
      final StringBuilder sb = auditBuffer.get();
      sb.setLength(0);
      sb.append("allowed=").append(succeeded).append("\t");
      sb.append("ugi=").append(userName).append("\t");
      sb.append("ip=").append(addr).append("\t");
      sb.append("cmd=").append(cmd).append("\t");
      sb.append("src=").append(src).append("\t");
      sb.append("dst=").append(dst).append("\t");
      if (null == status) {
        sb.append("perm=null");
      } else {
        sb.append("perm=");
        sb.append(status.getOwner()).append(":");
        sb.append(status.getGroup()).append(":");
        sb.append(status.getPermission());
      }
      if (logTokenTrackingId) {
        sb.append("\t").append("trackingId=");
        String trackingId = null;
        if (ugi != null && dtSecretManager != null
            && ugi.getAuthenticationMethod() == AuthenticationMethod.TOKEN) {
          // Use the first delegation token identifier found on the UGI.
          for (TokenIdentifier tid: ugi.getTokenIdentifiers()) {
            if (tid instanceof DelegationTokenIdentifier) {
              DelegationTokenIdentifier dtid =
                  (DelegationTokenIdentifier)tid;
              trackingId = dtSecretManager.getTokenTrackingId(dtid);
              break;
            }
          }
        }
        sb.append(trackingId);
      }
      sb.append("\t").append("proto=");
      sb.append(NamenodeWebHdfsMethods.isWebHdfsInvocation() ? "webhdfs" : "rpc");
      logAuditMessage(sb.toString());
    }
  }

  public void logAuditMessage(String message) {
    auditLog.info(message);
  }
}

/**
 * Wrap the audit log's appenders in a Log4j AsyncAppender so audit writes
 * do not block namesystem operations. No-op unless the logger is Log4j.
 */
private static void enableAsyncAuditLog() {
  if (!(auditLog instanceof Log4JLogger)) {
    LOG.warn("Log4j is required to enable async auditlog");
    return;
  }
  Logger logger = ((Log4JLogger)auditLog).getLogger();
  @SuppressWarnings("unchecked")
  List<Appender> appenders = Collections.list(logger.getAllAppenders());
  // failsafe against trying to async it more than once
  if (!appenders.isEmpty() && !(appenders.get(0) instanceof AsyncAppender)) {
    AsyncAppender asyncAppender = new AsyncAppender();
    // change logger to have an async appender containing all the
    // previously configured appenders
    for (Appender appender : appenders) {
      logger.removeAppender(appender);
      asyncAppender.addAppender(appender);
    }
    logger.addAppender(asyncAppender);
  }
}
}