/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
019package org.apache.hadoop.hbase.master;
020
021import java.io.IOException;
022import java.util.ArrayList;
023import java.util.Collections;
024import java.util.HashSet;
025import java.util.List;
026import java.util.Set;
027import java.util.concurrent.locks.Lock;
028import java.util.concurrent.locks.ReentrantLock;
029import java.util.stream.Collectors;
030import java.util.stream.Stream;
031import org.apache.hadoop.conf.Configuration;
032import org.apache.hadoop.fs.FileStatus;
033import org.apache.hadoop.fs.FileSystem;
034import org.apache.hadoop.fs.Path;
035import org.apache.hadoop.fs.PathFilter;
036import org.apache.hadoop.hbase.HConstants;
037import org.apache.hadoop.hbase.ServerName;
038import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
039import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
040import org.apache.hadoop.hbase.util.FSUtils;
041import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
042import org.apache.hadoop.hbase.wal.WALSplitter;
043import org.apache.yetus.audience.InterfaceAudience;
044import org.slf4j.Logger;
045import org.slf4j.LoggerFactory;
046import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
047
048/**
049 * This class abstracts a bunch of operations the HMaster needs
050 * when splitting log files e.g. finding log files, dirs etc.
051 */
052@InterfaceAudience.Private
053public class MasterWalManager {
054  private static final Logger LOG = LoggerFactory.getLogger(MasterWalManager.class);
055
056  final static PathFilter META_FILTER = new PathFilter() {
057    @Override
058    public boolean accept(Path p) {
059      return AbstractFSWALProvider.isMetaFile(p);
060    }
061  };
062
063  final static PathFilter NON_META_FILTER = new PathFilter() {
064    @Override
065    public boolean accept(Path p) {
066      return !AbstractFSWALProvider.isMetaFile(p);
067    }
068  };
069
070  // metrics for master
071  // TODO: Rename it, since those metrics are split-manager related
072  private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem();
073
074  // Keep around for convenience.
075  private final MasterServices services;
076  private final Configuration conf;
077  private final FileSystem fs;
078
079  // The Path to the old logs dir
080  private final Path oldLogDir;
081
082  /**
083   * This is the hbase rootdir.
084   * We'll put the WALs under this dir.
085   */
086  private final Path rootDir;
087
088  // create the split log lock
089  private final Lock splitLogLock = new ReentrantLock();
090  private final SplitLogManager splitLogManager;
091
092  // Is the fileystem ok?
093  private volatile boolean fsOk = true;
094
095  public MasterWalManager(MasterServices services) throws IOException {
096    this(services.getConfiguration(), services.getMasterFileSystem().getWALFileSystem(),
097      services.getMasterFileSystem().getWALRootDir(), services);
098  }
099
100  public MasterWalManager(Configuration conf, FileSystem fs, Path rootDir, MasterServices services)
101      throws IOException {
102    this.fs = fs;
103    this.conf = conf;
104    this.rootDir = rootDir;
105    this.services = services;
106    this.splitLogManager = new SplitLogManager(services, conf);
107
108    this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
109  }
110
111  public void stop() {
112    if (splitLogManager != null) {
113      splitLogManager.stop();
114    }
115  }
116
117  @VisibleForTesting
118  SplitLogManager getSplitLogManager() {
119    return this.splitLogManager;
120  }
121
122  /**
123   * Get the directory where old logs go
124   * @return the dir
125   */
126  Path getOldLogDir() {
127    return this.oldLogDir;
128  }
129
130  public FileSystem getFileSystem() {
131    return this.fs;
132  }
133
134  /**
135   * Checks to see if the file system is still accessible.
136   * If not, sets closed
137   * @return false if file system is not available
138   */
139  private boolean checkFileSystem() {
140    if (this.fsOk) {
141      try {
142        FSUtils.checkFileSystemAvailable(this.fs);
143        FSUtils.checkDfsSafeMode(this.conf);
144      } catch (IOException e) {
145        services.abort("Shutting down HBase cluster: file system not available", e);
146        this.fsOk = false;
147      }
148    }
149    return this.fsOk;
150  }
151
152  /**
153   * Get Servernames which are currently splitting; paths have a '-splitting' suffix.
154   * @return ServerName
155   * @throws IOException IOException
156   */
157  public Set<ServerName> getSplittingServersFromWALDir() throws  IOException {
158    return getServerNamesFromWALDirPath(
159      p -> p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
160  }
161
162  /**
163   * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these
164   * are already being split -- they cannot be 'alive'.
165   * @return ServerName
166   * @throws IOException IOException
167   */
168  public Set<ServerName> getLiveServersFromWALDir() throws IOException {
169    return getServerNamesFromWALDirPath(
170      p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
171  }
172
173  /**
174   * @return listing of ServerNames found by parsing WAL directory paths in FS.
175   *
176   */
177  public Set<ServerName> getServerNamesFromWALDirPath(final PathFilter filter) throws IOException {
178    FileStatus[] walDirForServerNames = getWALDirPaths(filter);
179    return Stream.of(walDirForServerNames).map(s -> {
180      ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(s.getPath());
181      if (serverName == null) {
182        LOG.warn("Log folder {} doesn't look like its name includes a " +
183          "region server name; leaving in place. If you see later errors about missing " +
184          "write ahead logs they may be saved in this location.", s.getPath());
185        return null;
186      }
187      return serverName;
188    }).filter(s -> s != null).collect(Collectors.toSet());
189  }
190
191  /**
192   * @return Returns the WALs dir under <code>rootDir</code>
193   */
194  Path getWALDirPath() {
195    return new Path(this.rootDir, HConstants.HREGION_LOGDIR_NAME);
196  }
197
198  /**
199   * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME.
200   */
201  public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException {
202    Path walDirPath = getWALDirPath();
203    FileStatus[] walDirForServerNames = FSUtils.listStatus(fs, walDirPath, filter);
204    return walDirForServerNames == null? new FileStatus[0]: walDirForServerNames;
205  }
206
207  /**
208   * Inspect the log directory to find dead servers which need recovery work
209   * @return A set of ServerNames which aren't running but still have WAL files left in file system
210   * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need
211   *             to scan the wal directory to find out the splitting wal directory any more. Leave
212   *             it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses
213   *             it.
214   */
215  @Deprecated
216  public Set<ServerName> getFailedServersFromLogFolders() {
217    boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
218        WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
219
220    Set<ServerName> serverNames = new HashSet<>();
221    Path logsDirPath = getWALDirPath();
222
223    do {
224      if (services.isStopped()) {
225        LOG.warn("Master stopped while trying to get failed servers.");
226        break;
227      }
228      try {
229        if (!this.fs.exists(logsDirPath)) return serverNames;
230        FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
231        // Get online servers after getting log folders to avoid log folder deletion of newly
232        // checked in region servers . see HBASE-5916
233        Set<ServerName> onlineServers = services.getServerManager().getOnlineServers().keySet();
234
235        if (logFolders == null || logFolders.length == 0) {
236          LOG.debug("No log files to split, proceeding...");
237          return serverNames;
238        }
239        for (FileStatus status : logFolders) {
240          FileStatus[] curLogFiles = FSUtils.listStatus(this.fs, status.getPath(), null);
241          if (curLogFiles == null || curLogFiles.length == 0) {
242            // Empty log folder. No recovery needed
243            continue;
244          }
245          final ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(
246              status.getPath());
247          if (null == serverName) {
248            LOG.warn("Log folder " + status.getPath() + " doesn't look like its name includes a " +
249                "region server name; leaving in place. If you see later errors about missing " +
250                "write ahead logs they may be saved in this location.");
251          } else if (!onlineServers.contains(serverName)) {
252            LOG.info("Log folder " + status.getPath() + " doesn't belong "
253                + "to a known region server, splitting");
254            serverNames.add(serverName);
255          } else {
256            LOG.info("Log folder " + status.getPath() + " belongs to an existing region server");
257          }
258        }
259        retrySplitting = false;
260      } catch (IOException ioe) {
261        LOG.warn("Failed getting failed servers to be recovered.", ioe);
262        if (!checkFileSystem()) {
263          LOG.warn("Bad Filesystem, exiting");
264          Runtime.getRuntime().halt(1);
265        }
266        try {
267          if (retrySplitting) {
268            Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000));
269          }
270        } catch (InterruptedException e) {
271          LOG.warn("Interrupted, aborting since cannot return w/o splitting");
272          Thread.currentThread().interrupt();
273          retrySplitting = false;
274          Runtime.getRuntime().halt(1);
275        }
276      }
277    } while (retrySplitting);
278
279    return serverNames;
280  }
281
282  public void splitLog(final ServerName serverName) throws IOException {
283    splitLog(Collections.<ServerName>singleton(serverName));
284  }
285
286  /**
287   * Specialized method to handle the splitting for meta WAL
288   * @param serverName logs belonging to this server will be split
289   */
290  public void splitMetaLog(final ServerName serverName) throws IOException {
291    splitMetaLog(Collections.<ServerName>singleton(serverName));
292  }
293
294  /**
295   * Specialized method to handle the splitting for meta WAL
296   * @param serverNames logs belonging to these servers will be split
297   */
298  public void splitMetaLog(final Set<ServerName> serverNames) throws IOException {
299    splitLog(serverNames, META_FILTER);
300  }
301
302  /**
303   * @return True if a WAL directory exists (will return true also if WALs found in
304   *   servername'-splitting' too).
305   */
306  boolean isWALDirectoryNameWithWALs(ServerName serverName) {
307    FileStatus [] fss = null;
308    try {
309      // 'startsWith' will also return dirs ending in AbstractFSWALProvider.SPLITTING_EXT
310      fss = getWALDirPaths(p -> p.getName().startsWith(serverName.toString()));
311    } catch (IOException ioe) {
312      LOG.warn("{}", serverName, ioe);
313      // Something wrong reading from fs. Returning 'true' to bring on more fs activity
314      return true;
315    }
316    if (fss != null) {
317      for (FileStatus fileStatus: fss) {
318        if (fileStatus.isDirectory()) {
319          // Not testing for existence; presuming exists if we got it out of getWALDirPaths
320          // listing. I used to test for presence of WAL and return false if empty but it can be
321          // empty if a clean shutdown. Even clean shutdowns need to be recovered so the meta
322          // and namespace assigns get triggered.
323          return true;
324        }
325      }
326    }
327    return false;
328  }
329
330  /**
331   * Depends on current FS Layout!
332   * @return The Path to the WAL directory for <code>serverName</code>
333   */
334  Path getWALDirectoryName(ServerName serverName) {
335    return new Path(this.rootDir, AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
336  }
337
338  /**
339   * Finds WAL dirs for <code>serverNames</code> and renames them with '-splitting' suffix.
340   * @return List of '-splitting' directories that pertain to <code>serverNames</code>
341   */
342  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification=
343      "We only release this lock when we set it. Updates to code that uses it should verify use " +
344      "of the guard boolean.")
345  List<Path> createAndGetLogDirs(final Set<ServerName> serverNames) throws IOException {
346    List<Path> logDirs = new ArrayList<>();
347    boolean needReleaseLock = false;
348    if (!this.services.isInitialized()) {
349      // during master initialization, we could have multiple places splitting a same wal
350      // XXX: Does this still exist after we move to proc-v2?
351      this.splitLogLock.lock();
352      needReleaseLock = true;
353    }
354    try {
355      for (ServerName serverName : serverNames) {
356        Path logDir = getWALDirectoryName(serverName);
357        // This adds the -splitting suffix to logDir.
358        Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
359        // Rename the directory so a rogue RS doesn't create more WALs
360        if (fs.exists(logDir)) {
361          if (!this.fs.rename(logDir, splitDir)) {
362            throw new IOException("Failed fs.rename for log split: " + logDir);
363          }
364          logDir = splitDir;
365          LOG.debug("Renamed region directory: " + splitDir);
366        } else if (!fs.exists(splitDir)) {
367          LOG.info("Log dir for server " + serverName + " does not exist");
368          continue;
369        }
370        logDirs.add(splitDir);
371      }
372    } catch (IOException ioe) {
373      if (!checkFileSystem()) {
374        this.services.abort("Aborting due to filesystem unavailable", ioe);
375        throw ioe;
376      }
377    } finally {
378      if (needReleaseLock) {
379        this.splitLogLock.unlock();
380      }
381    }
382    return logDirs;
383  }
384
385  public void splitLog(final Set<ServerName> serverNames) throws IOException {
386    splitLog(serverNames, NON_META_FILTER);
387  }
388
389  /**
390   * This method is the base split method that splits WAL files matching a filter. Callers should
391   * pass the appropriate filter for meta and non-meta WALs.
392   * @param serverNames logs belonging to these servers will be split; this will rename the log
393   *                    directory out from under a soft-failed server
394   */
395  public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throws IOException {
396    long splitTime = 0, splitLogSize = 0;
397    List<Path> logDirs = createAndGetLogDirs(serverNames);
398
399    splitLogManager.handleDeadWorkers(serverNames);
400    splitTime = EnvironmentEdgeManager.currentTime();
401    splitLogSize = splitLogManager.splitLogDistributed(serverNames, logDirs, filter);
402    splitTime = EnvironmentEdgeManager.currentTime() - splitTime;
403
404    if (this.metricsMasterFilesystem != null) {
405      if (filter == META_FILTER) {
406        this.metricsMasterFilesystem.addMetaWALSplit(splitTime, splitLogSize);
407      } else {
408        this.metricsMasterFilesystem.addSplit(splitTime, splitLogSize);
409      }
410    }
411  }
412
413  /**
414   * For meta region open and closed normally on a server, it may leave some meta
415   * WAL in the server's wal dir. Since meta region is no long on this server,
416   * The SCP won't split those meta wals, just leaving them there. So deleting
417   * the wal dir will fail since the dir is not empty. Actually We can safely achive those
418   * meta log and Archiving the meta log and delete the dir.
419   * @param serverName the server to archive meta log
420   */
421  public void archiveMetaLog(final ServerName serverName) {
422    try {
423      Path logDir = new Path(this.rootDir,
424          AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
425      Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
426      if (fs.exists(splitDir)) {
427        FileStatus[] logfiles = FSUtils.listStatus(fs, splitDir, META_FILTER);
428        if (logfiles != null) {
429          for (FileStatus status : logfiles) {
430            if (!status.isDir()) {
431              Path newPath = AbstractFSWAL.getWALArchivePath(this.oldLogDir,
432                  status.getPath());
433              if (!FSUtils.renameAndSetModifyTime(fs, status.getPath(), newPath)) {
434                LOG.warn("Unable to move  " + status.getPath() + " to " + newPath);
435              } else {
436                LOG.debug("Archived meta log " + status.getPath() + " to " + newPath);
437              }
438            }
439          }
440        }
441        if (!fs.delete(splitDir, false)) {
442          LOG.warn("Unable to delete log dir. Ignoring. " + splitDir);
443        }
444      }
445    } catch (IOException ie) {
446      LOG.warn("Failed archiving meta log for server " + serverName, ie);
447    }
448  }
449
450
451}