001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.regionserver.wal; 019 020import static org.apache.hadoop.hbase.wal.AbstractFSWALProvider.WAL_FILE_NAME_DELIMITER; 021import static org.apache.hbase.thirdparty.com.google.common.base.Preconditions.checkArgument; 022import static org.apache.hbase.thirdparty.com.google.common.base.Preconditions.checkNotNull; 023 024import com.lmax.disruptor.RingBuffer; 025import java.io.FileNotFoundException; 026import java.io.IOException; 027import java.io.InterruptedIOException; 028import java.lang.management.MemoryType; 029import java.net.URLEncoder; 030import java.util.ArrayList; 031import java.util.Arrays; 032import java.util.Comparator; 033import java.util.List; 034import java.util.Map; 035import java.util.OptionalLong; 036import java.util.Set; 037import java.util.concurrent.ConcurrentNavigableMap; 038import java.util.concurrent.ConcurrentSkipListMap; 039import java.util.concurrent.CopyOnWriteArrayList; 040import java.util.concurrent.ExecutionException; 041import java.util.concurrent.TimeUnit; 042import java.util.concurrent.atomic.AtomicBoolean; 043import java.util.concurrent.atomic.AtomicInteger; 044import 
java.util.concurrent.atomic.AtomicLong; 045import java.util.concurrent.locks.ReentrantLock; 046import org.apache.commons.lang3.mutable.MutableLong; 047import org.apache.hadoop.conf.Configuration; 048import org.apache.hadoop.fs.FileStatus; 049import org.apache.hadoop.fs.FileSystem; 050import org.apache.hadoop.fs.Path; 051import org.apache.hadoop.fs.PathFilter; 052import org.apache.hadoop.hbase.Cell; 053import org.apache.hadoop.hbase.HBaseConfiguration; 054import org.apache.hadoop.hbase.HConstants; 055import org.apache.hadoop.hbase.PrivateCellUtil; 056import org.apache.hadoop.hbase.client.RegionInfo; 057import org.apache.hadoop.hbase.exceptions.TimeoutIOException; 058import org.apache.hadoop.hbase.io.util.MemorySizeUtil; 059import org.apache.hadoop.hbase.log.HBaseMarkers; 060import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl; 061import org.apache.hadoop.hbase.trace.TraceUtil; 062import org.apache.hadoop.hbase.util.Bytes; 063import org.apache.hadoop.hbase.util.CommonFSUtils; 064import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 065import org.apache.hadoop.hbase.util.FSUtils; 066import org.apache.hadoop.hbase.util.Pair; 067import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; 068import org.apache.hadoop.hbase.wal.WAL; 069import org.apache.hadoop.hbase.wal.WALEdit; 070import org.apache.hadoop.hbase.wal.WALFactory; 071import org.apache.hadoop.hbase.wal.WALKeyImpl; 072import org.apache.hadoop.hbase.wal.WALPrettyPrinter; 073import org.apache.hadoop.hbase.wal.WALProvider.WriterBase; 074import org.apache.hadoop.hbase.wal.WALSplitter; 075import org.apache.hadoop.hdfs.protocol.DatanodeInfo; 076import org.apache.hadoop.util.StringUtils; 077import org.apache.htrace.core.TraceScope; 078import org.apache.yetus.audience.InterfaceAudience; 079import org.slf4j.Logger; 080import org.slf4j.LoggerFactory; 081 082import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting; 083 084/** 085 * Implementation of {@link WAL} to go 
 * against {@link FileSystem}; i.e. keep WALs in HDFS. Only one WAL is ever being written at a
 * time. When a WAL hits a configured maximum size, it is rolled. This is done internal to the
 * implementation.
 * <p>
 * As data is flushed from the MemStore to other on-disk structures (files sorted by key, hfiles), a
 * WAL becomes obsolete. We can let go of all the log edits/entries for a given HRegion-sequence id.
 * A bunch of work in the below is done keeping account of these region sequence ids -- what is
 * flushed out to hfiles, and what is yet in WAL and in memory only.
 * <p>
 * It is only practical to delete entire files. Thus, we delete an entire on-disk file
 * <code>F</code> when all of the edits in <code>F</code> have a log-sequence-id that's older
 * (smaller) than the most-recent flush.
 * <p>
 * To read a WAL, call
 * {@link WALFactory#createReader(org.apache.hadoop.fs.FileSystem, org.apache.hadoop.fs.Path)}.
 * <h2>Failure Semantic</h2> If an exception occurs on append or sync, roll the WAL because the
 * current WAL is now a lame duck; any more appends or syncs will fail also with the same original
 * exception. If we have made successful appends to the WAL and we then are unable to sync them, our
 * current semantic is to return error to the client that the appends failed but also to abort the
 * current context, usually the hosting server. We need to replay the WALs. <br>
 * TODO: Change this semantic. A roll of WAL may be sufficient as long as we have flagged client
 * that the append failed. <br>
 * TODO: replication may pick up these last edits though they have been marked as failed append
 * (Need to keep our own file lengths, not rely on HDFS).
109 */ 110@InterfaceAudience.Private 111public abstract class AbstractFSWAL<W extends WriterBase> implements WAL { 112 113 private static final Logger LOG = LoggerFactory.getLogger(AbstractFSWAL.class); 114 115 protected static final int DEFAULT_SLOW_SYNC_TIME_MS = 100; // in ms 116 117 private static final int DEFAULT_WAL_SYNC_TIMEOUT_MS = 5 * 60 * 1000; // in ms, 5min 118 119 /** 120 * file system instance 121 */ 122 protected final FileSystem fs; 123 124 /** 125 * WAL directory, where all WAL files would be placed. 126 */ 127 protected final Path walDir; 128 129 /** 130 * dir path where old logs are kept. 131 */ 132 protected final Path walArchiveDir; 133 134 /** 135 * Matches just those wal files that belong to this wal instance. 136 */ 137 protected final PathFilter ourFiles; 138 139 /** 140 * Prefix of a WAL file, usually the region server name it is hosted on. 141 */ 142 protected final String walFilePrefix; 143 144 /** 145 * Suffix included on generated wal file names 146 */ 147 protected final String walFileSuffix; 148 149 /** 150 * Prefix used when checking for wal membership. 151 */ 152 protected final String prefixPathStr; 153 154 protected final WALCoprocessorHost coprocessorHost; 155 156 /** 157 * conf object 158 */ 159 protected final Configuration conf; 160 161 /** Listeners that are called on WAL events. */ 162 protected final List<WALActionsListener> listeners = new CopyOnWriteArrayList<>(); 163 164 /** 165 * Class that does accounting of sequenceids in WAL subsystem. Holds oldest outstanding sequence 166 * id as yet not flushed as well as the most recent edit sequence id appended to the WAL. Has 167 * facility for answering questions such as "Is it safe to GC a WAL?". 168 */ 169 protected final SequenceIdAccounting sequenceIdAccounting = new SequenceIdAccounting(); 170 171 protected final long slowSyncNs; 172 173 private final long walSyncTimeoutNs; 174 175 // If > than this size, roll the log. 
176 protected final long logrollsize; 177 178 /** 179 * Block size to use writing files. 180 */ 181 protected final long blocksize; 182 183 /* 184 * If more than this many logs, force flush of oldest region to oldest edit goes to disk. If too 185 * many and we crash, then will take forever replaying. Keep the number of logs tidy. 186 */ 187 protected final int maxLogs; 188 189 /** 190 * This lock makes sure only one log roll runs at a time. Should not be taken while any other lock 191 * is held. We don't just use synchronized because that results in bogus and tedious findbugs 192 * warning when it thinks synchronized controls writer thread safety. It is held when we are 193 * actually rolling the log. It is checked when we are looking to see if we should roll the log or 194 * not. 195 */ 196 protected final ReentrantLock rollWriterLock = new ReentrantLock(true); 197 198 // The timestamp (in ms) when the log file was created. 199 protected final AtomicLong filenum = new AtomicLong(-1); 200 201 // Number of transactions in the current Wal. 202 protected final AtomicInteger numEntries = new AtomicInteger(0); 203 204 /** 205 * The highest known outstanding unsync'd WALEdit transaction id. Usually, we use a queue to pass 206 * WALEdit to background consumer thread, and the transaction id is the sequence number of the 207 * corresponding entry in queue. 208 */ 209 protected volatile long highestUnsyncedTxid = -1; 210 211 /** 212 * Updated to the transaction id of the last successful sync call. This can be less than 213 * {@link #highestUnsyncedTxid} for case where we have an append where a sync has not yet come in 214 * for it. 215 */ 216 protected final AtomicLong highestSyncedTxid = new AtomicLong(0); 217 218 /** 219 * The total size of wal 220 */ 221 protected final AtomicLong totalLogSize = new AtomicLong(0); 222 /** 223 * Current log file. 
224 */ 225 volatile W writer; 226 227 // Last time to check low replication on hlog's pipeline 228 private long lastTimeCheckLowReplication = EnvironmentEdgeManager.currentTime(); 229 230 protected volatile boolean closed = false; 231 232 protected final AtomicBoolean shutdown = new AtomicBoolean(false); 233 /** 234 * WAL Comparator; it compares the timestamp (log filenum), present in the log file name. Throws 235 * an IllegalArgumentException if used to compare paths from different wals. 236 */ 237 final Comparator<Path> LOG_NAME_COMPARATOR = 238 (o1, o2) -> Long.compare(getFileNumFromFileName(o1), getFileNumFromFileName(o2)); 239 240 private static final class WalProps { 241 242 /** 243 * Map the encoded region name to the highest sequence id. Contain all the regions it has 244 * entries of 245 */ 246 public final Map<byte[], Long> encodedName2HighestSequenceId; 247 248 /** 249 * The log file size. Notice that the size may not be accurate if we do asynchronous close in 250 * sub classes. 251 */ 252 public final long logSize; 253 254 public WalProps(Map<byte[], Long> encodedName2HighestSequenceId, long logSize) { 255 this.encodedName2HighestSequenceId = encodedName2HighestSequenceId; 256 this.logSize = logSize; 257 } 258 } 259 260 /** 261 * Map of WAL log file to properties. The map is sorted by the log file creation timestamp 262 * (contained in the log file name). 263 */ 264 protected ConcurrentNavigableMap<Path, WalProps> walFile2Props = 265 new ConcurrentSkipListMap<>(LOG_NAME_COMPARATOR); 266 267 /** 268 * Map of {@link SyncFuture}s owned by Thread objects. Used so we reuse SyncFutures. 269 * Thread local is used so JVM can GC the terminated thread for us. See HBASE-21228 270 * <p> 271 */ 272 private final ThreadLocal<SyncFuture> cachedSyncFutures; 273 274 /** 275 * The class name of the runtime implementation, used as prefix for logging/tracing. 
276 * <p> 277 * Performance testing shows getClass().getSimpleName() might be a bottleneck so we store it here, 278 * refer to HBASE-17676 for more details 279 * </p> 280 */ 281 protected final String implClassName; 282 283 public long getFilenum() { 284 return this.filenum.get(); 285 } 286 287 /** 288 * A log file has a creation timestamp (in ms) in its file name ({@link #filenum}. This helper 289 * method returns the creation timestamp from a given log file. It extracts the timestamp assuming 290 * the filename is created with the {@link #computeFilename(long filenum)} method. 291 * @return timestamp, as in the log file name. 292 */ 293 protected long getFileNumFromFileName(Path fileName) { 294 checkNotNull(fileName, "file name can't be null"); 295 if (!ourFiles.accept(fileName)) { 296 throw new IllegalArgumentException( 297 "The log file " + fileName + " doesn't belong to this WAL. (" + toString() + ")"); 298 } 299 final String fileNameString = fileName.toString(); 300 String chompedPath = fileNameString.substring(prefixPathStr.length(), 301 (fileNameString.length() - walFileSuffix.length())); 302 return Long.parseLong(chompedPath); 303 } 304 305 private int calculateMaxLogFiles(Configuration conf, long logRollSize) { 306 Pair<Long, MemoryType> globalMemstoreSize = MemorySizeUtil.getGlobalMemStoreSize(conf); 307 return (int) ((globalMemstoreSize.getFirst() * 2) / logRollSize); 308 } 309 310 // must be power of 2 311 protected final int getPreallocatedEventCount() { 312 // Preallocate objects to use on the ring buffer. The way that appends and syncs work, we will 313 // be stuck and make no progress if the buffer is filled with appends only and there is no 314 // sync. If no sync, then the handlers will be outstanding just waiting on sync completion 315 // before they return. 
316 int preallocatedEventCount = 317 this.conf.getInt("hbase.regionserver.wal.disruptor.event.count", 1024 * 16); 318 checkArgument(preallocatedEventCount >= 0, 319 "hbase.regionserver.wal.disruptor.event.count must > 0"); 320 int floor = Integer.highestOneBit(preallocatedEventCount); 321 if (floor == preallocatedEventCount) { 322 return floor; 323 } 324 // max capacity is 1 << 30 325 if (floor >= 1 << 29) { 326 return 1 << 30; 327 } 328 return floor << 1; 329 } 330 331 protected AbstractFSWAL(final FileSystem fs, final Path rootDir, final String logDir, 332 final String archiveDir, final Configuration conf, final List<WALActionsListener> listeners, 333 final boolean failIfWALExists, final String prefix, final String suffix) 334 throws FailedLogCloseException, IOException { 335 this.fs = fs; 336 this.walDir = new Path(rootDir, logDir); 337 this.walArchiveDir = new Path(rootDir, archiveDir); 338 this.conf = conf; 339 340 if (!fs.exists(walDir) && !fs.mkdirs(walDir)) { 341 throw new IOException("Unable to mkdir " + walDir); 342 } 343 344 if (!fs.exists(this.walArchiveDir)) { 345 if (!fs.mkdirs(this.walArchiveDir)) { 346 throw new IOException("Unable to mkdir " + this.walArchiveDir); 347 } 348 } 349 350 // If prefix is null||empty then just name it wal 351 this.walFilePrefix = 352 prefix == null || prefix.isEmpty() ? "wal" : URLEncoder.encode(prefix, "UTF8"); 353 // we only correctly differentiate suffices when numeric ones start with '.' 
354 if (suffix != null && !(suffix.isEmpty()) && !(suffix.startsWith(WAL_FILE_NAME_DELIMITER))) { 355 throw new IllegalArgumentException("WAL suffix must start with '" + WAL_FILE_NAME_DELIMITER + 356 "' but instead was '" + suffix + "'"); 357 } 358 // Now that it exists, set the storage policy for the entire directory of wal files related to 359 // this FSHLog instance 360 String storagePolicy = 361 conf.get(HConstants.WAL_STORAGE_POLICY, HConstants.DEFAULT_WAL_STORAGE_POLICY); 362 CommonFSUtils.setStoragePolicy(fs, this.walDir, storagePolicy); 363 this.walFileSuffix = (suffix == null) ? "" : URLEncoder.encode(suffix, "UTF8"); 364 this.prefixPathStr = new Path(walDir, walFilePrefix + WAL_FILE_NAME_DELIMITER).toString(); 365 366 this.ourFiles = new PathFilter() { 367 @Override 368 public boolean accept(final Path fileName) { 369 // The path should start with dir/<prefix> and end with our suffix 370 final String fileNameString = fileName.toString(); 371 if (!fileNameString.startsWith(prefixPathStr)) { 372 return false; 373 } 374 if (walFileSuffix.isEmpty()) { 375 // in the case of the null suffix, we need to ensure the filename ends with a timestamp. 376 return org.apache.commons.lang3.StringUtils 377 .isNumeric(fileNameString.substring(prefixPathStr.length())); 378 } else if (!fileNameString.endsWith(walFileSuffix)) { 379 return false; 380 } 381 return true; 382 } 383 }; 384 385 if (failIfWALExists) { 386 final FileStatus[] walFiles = CommonFSUtils.listStatus(fs, walDir, ourFiles); 387 if (null != walFiles && 0 != walFiles.length) { 388 throw new IOException("Target WAL already exists within directory " + walDir); 389 } 390 } 391 392 // Register listeners. TODO: Should this exist anymore? We have CPs? 393 if (listeners != null) { 394 for (WALActionsListener i : listeners) { 395 registerWALActionsListener(i); 396 } 397 } 398 this.coprocessorHost = new WALCoprocessorHost(this, conf); 399 400 // Schedule a WAL roll when the WAL is 50% of the HDFS block size. 
Scheduling at 50% of block 401 // size should make it so WAL rolls before we get to the end-of-block (Block transitions cost 402 // some latency). In hbase-1 we did this differently. We scheduled a roll when we hit 95% of 403 // the block size but experience from the field has it that this was not enough time for the 404 // roll to happen before end-of-block. So the new accounting makes WALs of about the same 405 // size as those made in hbase-1 (to prevent surprise), we now have default block size as 406 // 2 times the DFS default: i.e. 2 * DFS default block size rolling at 50% full will generally 407 // make similar size logs to 1 * DFS default block size rolling at 95% full. See HBASE-19148. 408 this.blocksize = WALUtil.getWALBlockSize(this.conf, this.fs, this.walDir); 409 float multiplier = conf.getFloat("hbase.regionserver.logroll.multiplier", 0.5f); 410 this.logrollsize = (long)(this.blocksize * multiplier); 411 this.maxLogs = conf.getInt("hbase.regionserver.maxlogs", 412 Math.max(32, calculateMaxLogFiles(conf, logrollsize))); 413 414 LOG.info("WAL configuration: blocksize=" + StringUtils.byteDesc(blocksize) + ", rollsize=" + 415 StringUtils.byteDesc(this.logrollsize) + ", prefix=" + this.walFilePrefix + ", suffix=" + 416 walFileSuffix + ", logDir=" + this.walDir + ", archiveDir=" + this.walArchiveDir); 417 this.slowSyncNs = TimeUnit.MILLISECONDS 418 .toNanos(conf.getInt("hbase.regionserver.hlog.slowsync.ms", DEFAULT_SLOW_SYNC_TIME_MS)); 419 this.walSyncTimeoutNs = TimeUnit.MILLISECONDS 420 .toNanos(conf.getLong("hbase.regionserver.hlog.sync.timeout", DEFAULT_WAL_SYNC_TIMEOUT_MS)); 421 this.cachedSyncFutures = new ThreadLocal<SyncFuture>() { 422 @Override 423 protected SyncFuture initialValue() { 424 return new SyncFuture(); 425 } 426 }; 427 this.implClassName = getClass().getSimpleName(); 428 } 429 430 @Override 431 public void registerWALActionsListener(WALActionsListener listener) { 432 this.listeners.add(listener); 433 } 434 435 @Override 436 public 
boolean unregisterWALActionsListener(WALActionsListener listener) { 437 return this.listeners.remove(listener); 438 } 439 440 @Override 441 public WALCoprocessorHost getCoprocessorHost() { 442 return coprocessorHost; 443 } 444 445 @Override 446 public Long startCacheFlush(byte[] encodedRegionName, Set<byte[]> families) { 447 return this.sequenceIdAccounting.startCacheFlush(encodedRegionName, families); 448 } 449 450 @Override 451 public Long startCacheFlush(byte[] encodedRegionName, Map<byte[], Long> familyToSeq) { 452 return this.sequenceIdAccounting.startCacheFlush(encodedRegionName, familyToSeq); 453 } 454 455 @Override 456 public void completeCacheFlush(byte[] encodedRegionName) { 457 this.sequenceIdAccounting.completeCacheFlush(encodedRegionName); 458 } 459 460 @Override 461 public void abortCacheFlush(byte[] encodedRegionName) { 462 this.sequenceIdAccounting.abortCacheFlush(encodedRegionName); 463 } 464 465 @Override 466 public long getEarliestMemStoreSeqNum(byte[] encodedRegionName) { 467 // Used by tests. Deprecated as too subtle for general usage. 468 return this.sequenceIdAccounting.getLowestSequenceId(encodedRegionName); 469 } 470 471 @Override 472 public long getEarliestMemStoreSeqNum(byte[] encodedRegionName, byte[] familyName) { 473 // This method is used by tests and for figuring if we should flush or not because our 474 // sequenceids are too old. It is also used reporting the master our oldest sequenceid for use 475 // figuring what edits can be skipped during log recovery. getEarliestMemStoreSequenceId 476 // from this.sequenceIdAccounting is looking first in flushingOldestStoreSequenceIds, the 477 // currently flushing sequence ids, and if anything found there, it is returning these. This is 478 // the right thing to do for the reporting oldest sequenceids to master; we won't skip edits if 479 // we crash during the flush. For figuring what to flush, we might get requeued if our sequence 480 // id is old even though we are currently flushing. 
This may mean we do too much flushing. 481 return this.sequenceIdAccounting.getLowestSequenceId(encodedRegionName, familyName); 482 } 483 484 @Override 485 public byte[][] rollWriter() throws FailedLogCloseException, IOException { 486 return rollWriter(false); 487 } 488 489 /** 490 * This is a convenience method that computes a new filename with a given file-number. 491 * @param filenum to use 492 * @return Path 493 */ 494 protected Path computeFilename(final long filenum) { 495 if (filenum < 0) { 496 throw new RuntimeException("WAL file number can't be < 0"); 497 } 498 String child = walFilePrefix + WAL_FILE_NAME_DELIMITER + filenum + walFileSuffix; 499 return new Path(walDir, child); 500 } 501 502 /** 503 * This is a convenience method that computes a new filename with a given using the current WAL 504 * file-number 505 * @return Path 506 */ 507 public Path getCurrentFileName() { 508 return computeFilename(this.filenum.get()); 509 } 510 511 /** 512 * retrieve the next path to use for writing. Increments the internal filenum. 513 */ 514 private Path getNewPath() throws IOException { 515 this.filenum.set(System.currentTimeMillis()); 516 Path newPath = getCurrentFileName(); 517 while (fs.exists(newPath)) { 518 this.filenum.incrementAndGet(); 519 newPath = getCurrentFileName(); 520 } 521 return newPath; 522 } 523 524 @VisibleForTesting 525 Path getOldPath() { 526 long currentFilenum = this.filenum.get(); 527 Path oldPath = null; 528 if (currentFilenum > 0) { 529 // ComputeFilename will take care of meta wal filename 530 oldPath = computeFilename(currentFilenum); 531 } // I presume if currentFilenum is <= 0, this is first file and null for oldPath if fine? 532 return oldPath; 533 } 534 535 /** 536 * Tell listeners about pre log roll. 
537 */ 538 private void tellListenersAboutPreLogRoll(final Path oldPath, final Path newPath) 539 throws IOException { 540 coprocessorHost.preWALRoll(oldPath, newPath); 541 542 if (!this.listeners.isEmpty()) { 543 for (WALActionsListener i : this.listeners) { 544 i.preLogRoll(oldPath, newPath); 545 } 546 } 547 } 548 549 /** 550 * Tell listeners about post log roll. 551 */ 552 private void tellListenersAboutPostLogRoll(final Path oldPath, final Path newPath) 553 throws IOException { 554 if (!this.listeners.isEmpty()) { 555 for (WALActionsListener i : this.listeners) { 556 i.postLogRoll(oldPath, newPath); 557 } 558 } 559 560 coprocessorHost.postWALRoll(oldPath, newPath); 561 } 562 563 // public only until class moves to o.a.h.h.wal 564 /** @return the number of rolled log files */ 565 public int getNumRolledLogFiles() { 566 return walFile2Props.size(); 567 } 568 569 // public only until class moves to o.a.h.h.wal 570 /** @return the number of log files in use */ 571 public int getNumLogFiles() { 572 // +1 for current use log 573 return getNumRolledLogFiles() + 1; 574 } 575 576 /** 577 * If the number of un-archived WAL files is greater than maximum allowed, check the first 578 * (oldest) WAL file, and returns those regions which should be flushed so that it can be 579 * archived. 580 * @return regions (encodedRegionNames) to flush in order to archive oldest WAL file. 
581 */ 582 byte[][] findRegionsToForceFlush() throws IOException { 583 byte[][] regions = null; 584 int logCount = getNumRolledLogFiles(); 585 if (logCount > this.maxLogs && logCount > 0) { 586 Map.Entry<Path, WalProps> firstWALEntry = this.walFile2Props.firstEntry(); 587 regions = 588 this.sequenceIdAccounting.findLower(firstWALEntry.getValue().encodedName2HighestSequenceId); 589 } 590 if (regions != null) { 591 StringBuilder sb = new StringBuilder(); 592 for (int i = 0; i < regions.length; i++) { 593 if (i > 0) { 594 sb.append(", "); 595 } 596 sb.append(Bytes.toStringBinary(regions[i])); 597 } 598 LOG.info("Too many WALs; count=" + logCount + ", max=" + this.maxLogs + 599 "; forcing flush of " + regions.length + " regions(s): " + sb.toString()); 600 } 601 return regions; 602 } 603 604 /** 605 * Archive old logs. A WAL is eligible for archiving if all its WALEdits have been flushed. 606 */ 607 private void cleanOldLogs() throws IOException { 608 List<Pair<Path, Long>> logsToArchive = null; 609 // For each log file, look at its Map of regions to highest sequence id; if all sequence ids 610 // are older than what is currently in memory, the WAL can be GC'd. 611 for (Map.Entry<Path, WalProps> e : this.walFile2Props.entrySet()) { 612 Path log = e.getKey(); 613 Map<byte[], Long> sequenceNums = e.getValue().encodedName2HighestSequenceId; 614 if (this.sequenceIdAccounting.areAllLower(sequenceNums)) { 615 if (logsToArchive == null) { 616 logsToArchive = new ArrayList<>(); 617 } 618 logsToArchive.add(Pair.newPair(log, e.getValue().logSize)); 619 if (LOG.isTraceEnabled()) { 620 LOG.trace("WAL file ready for archiving " + log); 621 } 622 } 623 } 624 if (logsToArchive != null) { 625 for (Pair<Path, Long> logAndSize : logsToArchive) { 626 this.totalLogSize.addAndGet(-logAndSize.getSecond()); 627 archiveLogFile(logAndSize.getFirst()); 628 this.walFile2Props.remove(logAndSize.getFirst()); 629 } 630 } 631 } 632 633 /* 634 * only public so WALSplitter can use. 
635 * @return archived location of a WAL file with the given path p 636 */ 637 public static Path getWALArchivePath(Path archiveDir, Path p) { 638 return new Path(archiveDir, p.getName()); 639 } 640 641 private void archiveLogFile(final Path p) throws IOException { 642 Path newPath = getWALArchivePath(this.walArchiveDir, p); 643 // Tell our listeners that a log is going to be archived. 644 if (!this.listeners.isEmpty()) { 645 for (WALActionsListener i : this.listeners) { 646 i.preLogArchive(p, newPath); 647 } 648 } 649 LOG.info("Archiving " + p + " to " + newPath); 650 if (!CommonFSUtils.renameAndSetModifyTime(this.fs, p, newPath)) { 651 throw new IOException("Unable to rename " + p + " to " + newPath); 652 } 653 // Tell our listeners that a log has been archived. 654 if (!this.listeners.isEmpty()) { 655 for (WALActionsListener i : this.listeners) { 656 i.postLogArchive(p, newPath); 657 } 658 } 659 } 660 661 protected final void logRollAndSetupWalProps(Path oldPath, Path newPath, long oldFileLen) { 662 int oldNumEntries = this.numEntries.getAndSet(0); 663 String newPathString = newPath != null ? CommonFSUtils.getPath(newPath) : null; 664 if (oldPath != null) { 665 this.walFile2Props.put(oldPath, 666 new WalProps(this.sequenceIdAccounting.resetHighest(), oldFileLen)); 667 this.totalLogSize.addAndGet(oldFileLen); 668 LOG.info("Rolled WAL {} with entries={}, filesize={}; new WAL {}", 669 CommonFSUtils.getPath(oldPath), oldNumEntries, StringUtils.byteDesc(oldFileLen), 670 newPathString); 671 } else { 672 LOG.info("New WAL {}", newPathString); 673 } 674 } 675 676 /** 677 * <p> 678 * Cleans up current writer closing it and then puts in place the passed in 679 * <code>nextWriter</code>. 680 * </p> 681 * <p> 682 * <ul> 683 * <li>In the case of creating a new WAL, oldPath will be null.</li> 684 * <li>In the case of rolling over from one file to the next, none of the parameters will be null. 
685 * </li> 686 * <li>In the case of closing out this FSHLog with no further use newPath and nextWriter will be 687 * null.</li> 688 * </ul> 689 * </p> 690 * @param oldPath may be null 691 * @param newPath may be null 692 * @param nextWriter may be null 693 * @return the passed in <code>newPath</code> 694 * @throws IOException if there is a problem flushing or closing the underlying FS 695 */ 696 @VisibleForTesting 697 Path replaceWriter(Path oldPath, Path newPath, W nextWriter) throws IOException { 698 try (TraceScope scope = TraceUtil.createTrace("FSHFile.replaceWriter")) { 699 doReplaceWriter(oldPath, newPath, nextWriter); 700 return newPath; 701 } 702 } 703 704 protected final void blockOnSync(SyncFuture syncFuture) throws IOException { 705 // Now we have published the ringbuffer, halt the current thread until we get an answer back. 706 try { 707 if (syncFuture != null) { 708 if (closed) { 709 throw new IOException("WAL has been closed"); 710 } else { 711 syncFuture.get(walSyncTimeoutNs); 712 } 713 } 714 } catch (TimeoutIOException tioe) { 715 // SyncFuture reuse by thread, if TimeoutIOException happens, ringbuffer 716 // still refer to it, so if this thread use it next time may get a wrong 717 // result. 718 this.cachedSyncFutures.remove(); 719 throw tioe; 720 } catch (InterruptedException ie) { 721 LOG.warn("Interrupted", ie); 722 throw convertInterruptedExceptionToIOException(ie); 723 } catch (ExecutionException e) { 724 throw ensureIOException(e.getCause()); 725 } 726 } 727 728 private static IOException ensureIOException(final Throwable t) { 729 return (t instanceof IOException) ? 
(IOException) t : new IOException(t); 730 } 731 732 private IOException convertInterruptedExceptionToIOException(final InterruptedException ie) { 733 Thread.currentThread().interrupt(); 734 IOException ioe = new InterruptedIOException(); 735 ioe.initCause(ie); 736 return ioe; 737 } 738 739 @Override 740 public byte[][] rollWriter(boolean force) throws FailedLogCloseException, IOException { 741 rollWriterLock.lock(); 742 try { 743 // Return if nothing to flush. 744 if (!force && this.writer != null && this.numEntries.get() <= 0) { 745 return null; 746 } 747 byte[][] regionsToFlush = null; 748 if (this.closed) { 749 LOG.debug("WAL closed. Skipping rolling of writer"); 750 return regionsToFlush; 751 } 752 try (TraceScope scope = TraceUtil.createTrace("FSHLog.rollWriter")) { 753 Path oldPath = getOldPath(); 754 Path newPath = getNewPath(); 755 // Any exception from here on is catastrophic, non-recoverable so we currently abort. 756 W nextWriter = this.createWriterInstance(newPath); 757 tellListenersAboutPreLogRoll(oldPath, newPath); 758 // NewPath could be equal to oldPath if replaceWriter fails. 759 newPath = replaceWriter(oldPath, newPath, nextWriter); 760 tellListenersAboutPostLogRoll(oldPath, newPath); 761 if (LOG.isDebugEnabled()) { 762 LOG.debug("Create new " + implClassName + " writer with pipeline: " + 763 Arrays.toString(getPipeline())); 764 } 765 // Can we delete any of the old log files? 766 if (getNumRolledLogFiles() > 0) { 767 cleanOldLogs(); 768 regionsToFlush = findRegionsToForceFlush(); 769 } 770 } catch (CommonFSUtils.StreamLacksCapabilityException exception) { 771 // If the underlying FileSystem can't do what we ask, treat as IO failure so 772 // we'll abort. 773 throw new IOException( 774 "Underlying FileSystem can't meet stream requirements. 
See RS log " + "for details.", 775 exception); 776 } 777 return regionsToFlush; 778 } finally { 779 rollWriterLock.unlock(); 780 } 781 } 782 783 // public only until class moves to o.a.h.h.wal 784 /** @return the size of log files in use */ 785 public long getLogFileSize() { 786 return this.totalLogSize.get(); 787 } 788 789 // public only until class moves to o.a.h.h.wal 790 public void requestLogRoll() { 791 requestLogRoll(false); 792 } 793 794 /** 795 * Get the backing files associated with this WAL. 796 * @return may be null if there are no files. 797 */ 798 @VisibleForTesting 799 FileStatus[] getFiles() throws IOException { 800 return CommonFSUtils.listStatus(fs, walDir, ourFiles); 801 } 802 803 @Override 804 public void shutdown() throws IOException { 805 if (!shutdown.compareAndSet(false, true)) { 806 return; 807 } 808 closed = true; 809 // Tell our listeners that the log is closing 810 if (!this.listeners.isEmpty()) { 811 for (WALActionsListener i : this.listeners) { 812 i.logCloseRequested(); 813 } 814 } 815 rollWriterLock.lock(); 816 try { 817 doShutdown(); 818 } finally { 819 rollWriterLock.unlock(); 820 } 821 } 822 823 @Override 824 public void close() throws IOException { 825 shutdown(); 826 final FileStatus[] files = getFiles(); 827 if (null != files && 0 != files.length) { 828 for (FileStatus file : files) { 829 Path p = getWALArchivePath(this.walArchiveDir, file.getPath()); 830 // Tell our listeners that a log is going to be archived. 831 if (!this.listeners.isEmpty()) { 832 for (WALActionsListener i : this.listeners) { 833 i.preLogArchive(file.getPath(), p); 834 } 835 } 836 837 if (!CommonFSUtils.renameAndSetModifyTime(fs, file.getPath(), p)) { 838 throw new IOException("Unable to rename " + file.getPath() + " to " + p); 839 } 840 // Tell our listeners that a log was archived. 
841 if (!this.listeners.isEmpty()) { 842 for (WALActionsListener i : this.listeners) { 843 i.postLogArchive(file.getPath(), p); 844 } 845 } 846 } 847 LOG.debug( 848 "Moved " + files.length + " WAL file(s) to " + CommonFSUtils.getPath(this.walArchiveDir)); 849 } 850 LOG.info("Closed WAL: " + toString()); 851 } 852 853 /** 854 * updates the sequence number of a specific store. depending on the flag: replaces current seq 855 * number if the given seq id is bigger, or even if it is lower than existing one 856 * @param encodedRegionName 857 * @param familyName 858 * @param sequenceid 859 * @param onlyIfGreater 860 */ 861 @Override 862 public void updateStore(byte[] encodedRegionName, byte[] familyName, Long sequenceid, 863 boolean onlyIfGreater) { 864 sequenceIdAccounting.updateStore(encodedRegionName, familyName, sequenceid, onlyIfGreater); 865 } 866 867 protected final SyncFuture getSyncFuture(long sequence) { 868 return cachedSyncFutures.get().reset(sequence); 869 } 870 871 protected final void requestLogRoll(boolean tooFewReplicas) { 872 if (!this.listeners.isEmpty()) { 873 for (WALActionsListener i : this.listeners) { 874 i.logRollRequested(tooFewReplicas); 875 } 876 } 877 } 878 879 long getUnflushedEntriesCount() { 880 long highestSynced = this.highestSyncedTxid.get(); 881 long highestUnsynced = this.highestUnsyncedTxid; 882 return highestSynced >= highestUnsynced ? 0 : highestUnsynced - highestSynced; 883 } 884 885 boolean isUnflushedEntries() { 886 return getUnflushedEntriesCount() > 0; 887 } 888 889 /** 890 * Exposed for testing only. Use to tricks like halt the ring buffer appending. 891 */ 892 @VisibleForTesting 893 void atHeadOfRingBufferEventHandlerAppend() { 894 // Noop 895 } 896 897 protected final boolean append(W writer, FSWALEntry entry) throws IOException { 898 // TODO: WORK ON MAKING THIS APPEND FASTER. DOING WAY TOO MUCH WORK WITH CPs, PBing, etc. 
899 atHeadOfRingBufferEventHandlerAppend(); 900 long start = EnvironmentEdgeManager.currentTime(); 901 byte[] encodedRegionName = entry.getKey().getEncodedRegionName(); 902 long regionSequenceId = entry.getKey().getSequenceId(); 903 904 // Edits are empty, there is nothing to append. Maybe empty when we are looking for a 905 // region sequence id only, a region edit/sequence id that is not associated with an actual 906 // edit. It has to go through all the rigmarole to be sure we have the right ordering. 907 if (entry.getEdit().isEmpty()) { 908 return false; 909 } 910 911 // Coprocessor hook. 912 coprocessorHost.preWALWrite(entry.getRegionInfo(), entry.getKey(), entry.getEdit()); 913 if (!listeners.isEmpty()) { 914 for (WALActionsListener i : listeners) { 915 i.visitLogEntryBeforeWrite(entry.getKey(), entry.getEdit()); 916 } 917 } 918 doAppend(writer, entry); 919 assert highestUnsyncedTxid < entry.getTxid(); 920 highestUnsyncedTxid = entry.getTxid(); 921 sequenceIdAccounting.update(encodedRegionName, entry.getFamilyNames(), regionSequenceId, 922 entry.isInMemStore()); 923 coprocessorHost.postWALWrite(entry.getRegionInfo(), entry.getKey(), entry.getEdit()); 924 // Update metrics. 
925 postAppend(entry, EnvironmentEdgeManager.currentTime() - start); 926 numEntries.incrementAndGet(); 927 return true; 928 } 929 930 private long postAppend(final Entry e, final long elapsedTime) throws IOException { 931 long len = 0; 932 if (!listeners.isEmpty()) { 933 for (Cell cell : e.getEdit().getCells()) { 934 len += PrivateCellUtil.estimatedSerializedSizeOf(cell); 935 } 936 for (WALActionsListener listener : listeners) { 937 listener.postAppend(len, elapsedTime, e.getKey(), e.getEdit()); 938 } 939 } 940 return len; 941 } 942 943 protected final void postSync(final long timeInNanos, final int handlerSyncs) { 944 if (timeInNanos > this.slowSyncNs) { 945 String msg = new StringBuilder().append("Slow sync cost: ").append(timeInNanos / 1000000) 946 .append(" ms, current pipeline: ").append(Arrays.toString(getPipeline())).toString(); 947 TraceUtil.addTimelineAnnotation(msg); 948 LOG.info(msg); 949 } 950 if (!listeners.isEmpty()) { 951 for (WALActionsListener listener : listeners) { 952 listener.postSync(timeInNanos, handlerSyncs); 953 } 954 } 955 } 956 957 protected final long stampSequenceIdAndPublishToRingBuffer(RegionInfo hri, WALKeyImpl key, 958 WALEdit edits, boolean inMemstore, RingBuffer<RingBufferTruck> ringBuffer) 959 throws IOException { 960 if (this.closed) { 961 throw new IOException( 962 "Cannot append; log is closed, regionName = " + hri.getRegionNameAsString()); 963 } 964 MutableLong txidHolder = new MutableLong(); 965 MultiVersionConcurrencyControl.WriteEntry we = key.getMvcc().begin(() -> { 966 txidHolder.setValue(ringBuffer.next()); 967 }); 968 long txid = txidHolder.longValue(); 969 try (TraceScope scope = TraceUtil.createTrace(implClassName + ".append")) { 970 FSWALEntry entry = new FSWALEntry(txid, key, edits, hri, inMemstore); 971 entry.stampRegionSequenceId(we); 972 ringBuffer.get(txid).load(entry); 973 } finally { 974 ringBuffer.publish(txid); 975 } 976 return txid; 977 } 978 979 @Override 980 public String toString() { 981 return 
implClassName + " " + walFilePrefix + ":" + walFileSuffix + "(num " + filenum + ")"; 982 } 983 984 /** 985 * if the given {@code path} is being written currently, then return its length. 986 * <p> 987 * This is used by replication to prevent replicating unacked log entries. See 988 * https://issues.apache.org/jira/browse/HBASE-14004 for more details. 989 */ 990 @Override 991 public OptionalLong getLogFileSizeIfBeingWritten(Path path) { 992 rollWriterLock.lock(); 993 try { 994 Path currentPath = getOldPath(); 995 if (path.equals(currentPath)) { 996 W writer = this.writer; 997 return writer != null ? OptionalLong.of(writer.getLength()) : OptionalLong.empty(); 998 } else { 999 return OptionalLong.empty(); 1000 } 1001 } finally { 1002 rollWriterLock.unlock(); 1003 } 1004 } 1005 1006 /** 1007 * NOTE: This append, at a time that is usually after this call returns, starts an mvcc 1008 * transaction by calling 'begin' wherein which we assign this update a sequenceid. At assignment 1009 * time, we stamp all the passed in Cells inside WALEdit with their sequenceId. You must 1010 * 'complete' the transaction this mvcc transaction by calling 1011 * MultiVersionConcurrencyControl#complete(...) or a variant otherwise mvcc will get stuck. Do it 1012 * in the finally of a try/finally block within which this append lives and any subsequent 1013 * operations like sync or update of memstore, etc. Get the WriteEntry to pass mvcc out of the 1014 * passed in WALKey <code>walKey</code> parameter. Be warned that the WriteEntry is not 1015 * immediately available on return from this method. It WILL be available subsequent to a sync of 1016 * this append; otherwise, you will just have to wait on the WriteEntry to get filled in. 
1017 */ 1018 @Override 1019 public abstract long append(RegionInfo info, WALKeyImpl key, WALEdit edits, boolean inMemstore) 1020 throws IOException; 1021 1022 protected abstract void doAppend(W writer, FSWALEntry entry) throws IOException; 1023 1024 protected abstract W createWriterInstance(Path path) 1025 throws IOException, CommonFSUtils.StreamLacksCapabilityException; 1026 1027 protected abstract void doReplaceWriter(Path oldPath, Path newPath, W nextWriter) 1028 throws IOException; 1029 1030 protected abstract void doShutdown() throws IOException; 1031 1032 protected abstract boolean doCheckLogLowReplication(); 1033 1034 public void checkLogLowReplication(long checkInterval) { 1035 long now = EnvironmentEdgeManager.currentTime(); 1036 if (now - lastTimeCheckLowReplication < checkInterval) { 1037 return; 1038 } 1039 // Will return immediately if we are in the middle of a WAL log roll currently. 1040 if (!rollWriterLock.tryLock()) { 1041 return; 1042 } 1043 try { 1044 lastTimeCheckLowReplication = now; 1045 if (doCheckLogLowReplication()) { 1046 requestLogRoll(true); 1047 } 1048 } finally { 1049 rollWriterLock.unlock(); 1050 } 1051 } 1052 1053 /** 1054 * This method gets the pipeline for the current WAL. 1055 */ 1056 @VisibleForTesting 1057 abstract DatanodeInfo[] getPipeline(); 1058 1059 /** 1060 * This method gets the datanode replication count for the current WAL. 
1061 */ 1062 @VisibleForTesting 1063 abstract int getLogReplication(); 1064 1065 private static void split(final Configuration conf, final Path p) throws IOException { 1066 FileSystem fs = FSUtils.getWALFileSystem(conf); 1067 if (!fs.exists(p)) { 1068 throw new FileNotFoundException(p.toString()); 1069 } 1070 if (!fs.getFileStatus(p).isDirectory()) { 1071 throw new IOException(p + " is not a directory"); 1072 } 1073 1074 final Path baseDir = FSUtils.getWALRootDir(conf); 1075 Path archiveDir = new Path(baseDir, HConstants.HREGION_OLDLOGDIR_NAME); 1076 if (conf.getBoolean(AbstractFSWALProvider.SEPARATE_OLDLOGDIR, 1077 AbstractFSWALProvider.DEFAULT_SEPARATE_OLDLOGDIR)) { 1078 archiveDir = new Path(archiveDir, p.getName()); 1079 } 1080 WALSplitter.split(baseDir, p, archiveDir, fs, conf, WALFactory.getInstance(conf)); 1081 } 1082 1083 private static void usage() { 1084 System.err.println("Usage: AbstractFSWAL <ARGS>"); 1085 System.err.println("Arguments:"); 1086 System.err.println(" --dump Dump textual representation of passed one or more files"); 1087 System.err.println(" For example: " + 1088 "AbstractFSWAL --dump hdfs://example.com:9000/hbase/WALs/MACHINE/LOGFILE"); 1089 System.err.println(" --split Split the passed directory of WAL logs"); 1090 System.err.println( 1091 " For example: AbstractFSWAL --split hdfs://example.com:9000/hbase/WALs/DIR"); 1092 } 1093 1094 /** 1095 * Pass one or more log file names and it will either dump out a text version on 1096 * <code>stdout</code> or split the specified log files. 
1097 */ 1098 public static void main(String[] args) throws IOException { 1099 if (args.length < 2) { 1100 usage(); 1101 System.exit(-1); 1102 } 1103 // either dump using the WALPrettyPrinter or split, depending on args 1104 if (args[0].compareTo("--dump") == 0) { 1105 WALPrettyPrinter.run(Arrays.copyOfRange(args, 1, args.length)); 1106 } else if (args[0].compareTo("--perf") == 0) { 1107 LOG.error(HBaseMarkers.FATAL, "Please use the WALPerformanceEvaluation tool instead. i.e.:"); 1108 LOG.error(HBaseMarkers.FATAL, 1109 "\thbase org.apache.hadoop.hbase.wal.WALPerformanceEvaluation --iterations " + args[1]); 1110 System.exit(-1); 1111 } else if (args[0].compareTo("--split") == 0) { 1112 Configuration conf = HBaseConfiguration.create(); 1113 for (int i = 1; i < args.length; i++) { 1114 try { 1115 Path logPath = new Path(args[i]); 1116 FSUtils.setFsDefault(conf, logPath); 1117 split(conf, logPath); 1118 } catch (IOException t) { 1119 t.printStackTrace(System.err); 1120 System.exit(-1); 1121 } 1122 } 1123 } else { 1124 usage(); 1125 System.exit(-1); 1126 } 1127 } 1128}