001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import java.io.Closeable;
021import java.io.IOException;
022import org.apache.hadoop.conf.Configurable;
023import org.apache.hadoop.conf.Configuration;
024import org.apache.hadoop.hbase.client.RegionInfoBuilder;
025import org.apache.hadoop.hbase.util.Threads;
026import org.apache.yetus.audience.InterfaceAudience;
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminService;
031import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.ClientService;
032import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.MasterService;
033
034/**
035 * This class defines methods that can help with managing HBase clusters
036 * from unit tests and system tests. There are 3 types of cluster deployments:
037 * <ul>
038 * <li><b>MiniHBaseCluster:</b> each server is run in the same JVM in separate threads,
039 * used by unit tests</li>
040 * <li><b>DistributedHBaseCluster:</b> the cluster is pre-deployed, system and integration tests can
041 * interact with the cluster. </li>
042 * <li><b>ProcessBasedLocalHBaseCluster:</b> each server is deployed locally but in separate
043 * JVMs. </li>
044 * </ul>
045 * <p>
046 * HBaseCluster unifies the way tests interact with the cluster, so that the same test can
047 * be run against a mini-cluster during unit test execution, or a distributed cluster having
048 * tens/hundreds of nodes during execution of integration tests.
049 *
050 * <p>
051 * HBaseCluster exposes client-side public interfaces to tests, so that tests does not assume
052 * running in a particular mode. Not all the tests are suitable to be run on an actual cluster,
053 * and some tests will still need to mock stuff and introspect internal state. For those use
054 * cases from unit tests, or if more control is needed, you can use the subclasses directly.
055 * In that sense, this class does not abstract away <strong>every</strong> interface that
056 * MiniHBaseCluster or DistributedHBaseCluster provide.
057 */
058@InterfaceAudience.Private
059public abstract class HBaseCluster implements Closeable, Configurable {
060  // Log is being used in DistributedHBaseCluster class, hence keeping it as package scope
061  static final Logger LOG = LoggerFactory.getLogger(HBaseCluster.class.getName());
062  protected Configuration conf;
063
064  /** the status of the cluster before we begin */
065  protected ClusterMetrics initialClusterStatus;
066
067  /**
068   * Construct an HBaseCluster
069   * @param conf Configuration to be used for cluster
070   */
071  public HBaseCluster(Configuration conf) {
072    setConf(conf);
073  }
074
075  @Override
076  public void setConf(Configuration conf) {
077    this.conf = conf;
078  }
079
080  @Override
081  public Configuration getConf() {
082    return conf;
083  }
084
085  /**
086   * Returns a ClusterMetrics for this HBase cluster.
087   * @see #getInitialClusterMetrics()
088   */
089  public abstract ClusterMetrics getClusterMetrics() throws IOException;
090
091  /**
092   * Returns a ClusterStatus for this HBase cluster as observed at the
093   * starting of the HBaseCluster
094   */
095  public ClusterMetrics getInitialClusterMetrics() throws IOException {
096    return initialClusterStatus;
097  }
098
099  /**
100   * Returns an {@link MasterService.BlockingInterface} to the active master
101   */
102  public abstract MasterService.BlockingInterface getMasterAdminService()
103  throws IOException;
104
105  /**
106   * Returns an AdminProtocol interface to the regionserver
107   */
108  public abstract AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
109  throws IOException;
110
111  /**
112   * Returns a ClientProtocol interface to the regionserver
113   */
114  public abstract ClientService.BlockingInterface getClientProtocol(ServerName serverName)
115  throws IOException;
116
117  /**
118   * Starts a new region server on the given hostname or if this is a mini/local cluster,
119   * starts a region server locally.
120   * @param hostname the hostname to start the regionserver on
121   * @throws IOException if something goes wrong
122   */
123  public abstract void startRegionServer(String hostname, int port) throws IOException;
124
125  /**
126   * Kills the region server process if this is a distributed cluster, otherwise
127   * this causes the region server to exit doing basic clean up only.
128   * @throws IOException if something goes wrong
129   */
130  public abstract void killRegionServer(ServerName serverName) throws IOException;
131
132  /**
133   * Keeping track of killed servers and being able to check if a particular server was killed makes
134   * it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete
135   * example of such case is - killing servers and waiting for all regions of a particular table
136   * to be assigned. We can check for server column in META table and that its value is not one
137   * of the killed servers.
138   */
139  public abstract boolean isKilledRS(ServerName serverName);
140
141  /**
142   * Stops the given region server, by attempting a gradual stop.
143   * @return whether the operation finished with success
144   * @throws IOException if something goes wrong
145   */
146  public abstract void stopRegionServer(ServerName serverName) throws IOException;
147
148  /**
149   * Wait for the specified region server to join the cluster
150   * @return whether the operation finished with success
151   * @throws IOException if something goes wrong or timeout occurs
152   */
153  public void waitForRegionServerToStart(String hostname, int port, long timeout)
154      throws IOException {
155    long start = System.currentTimeMillis();
156    while ((System.currentTimeMillis() - start) < timeout) {
157      for (ServerName server : getClusterMetrics().getLiveServerMetrics().keySet()) {
158        if (server.getHostname().equals(hostname) && server.getPort() == port) {
159          return;
160        }
161      }
162      Threads.sleep(100);
163    }
164    throw new IOException("did timeout " + timeout + "ms waiting for region server to start: "
165        + hostname);
166  }
167
168  /**
169   * Wait for the specified region server to stop the thread / process.
170   * @return whether the operation finished with success
171   * @throws IOException if something goes wrong or timeout occurs
172   */
173  public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
174      throws IOException;
175
176  /**
177   * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster,
178   * silently logs warning message.
179   * @param hostname the hostname to start the regionserver on
180   * @throws IOException if something goes wrong
181   */
182  public abstract void startZkNode(String hostname, int port) throws IOException;
183
184  /**
185   * Kills the zookeeper node process if this is a distributed cluster, otherwise,
186   * this causes master to exit doing basic clean up only.
187   * @throws IOException if something goes wrong
188   */
189  public abstract void killZkNode(ServerName serverName) throws IOException;
190
191  /**
192   * Stops the region zookeeper if this is a distributed cluster, otherwise
193   * silently logs warning message.
194   * @throws IOException if something goes wrong
195   */
196  public abstract void stopZkNode(ServerName serverName) throws IOException;
197
198  /**
199   * Wait for the specified zookeeper node to join the cluster
200   * @return whether the operation finished with success
201   * @throws IOException if something goes wrong or timeout occurs
202   */
203  public abstract void waitForZkNodeToStart(ServerName serverName, long timeout)
204    throws IOException;
205
206  /**
207   * Wait for the specified zookeeper node to stop the thread / process.
208   * @return whether the operation finished with success
209   * @throws IOException if something goes wrong or timeout occurs
210   */
211  public abstract void waitForZkNodeToStop(ServerName serverName, long timeout)
212    throws IOException;
213
214  /**
215   * Starts a new datanode on the given hostname or if this is a mini/local cluster,
216   * silently logs warning message.
217   * @throws IOException if something goes wrong
218   */
219  public abstract void startDataNode(ServerName serverName) throws IOException;
220
221  /**
222   * Kills the datanode process if this is a distributed cluster, otherwise,
223   * this causes master to exit doing basic clean up only.
224   * @throws IOException if something goes wrong
225   */
226  public abstract void killDataNode(ServerName serverName) throws IOException;
227
228  /**
229   * Stops the datanode if this is a distributed cluster, otherwise
230   * silently logs warning message.
231   * @throws IOException if something goes wrong
232   */
233  public abstract void stopDataNode(ServerName serverName) throws IOException;
234
235  /**
236   * Wait for the specified datanode to join the cluster
237   * @return whether the operation finished with success
238   * @throws IOException if something goes wrong or timeout occurs
239   */
240  public abstract void waitForDataNodeToStart(ServerName serverName, long timeout)
241    throws IOException;
242
243  /**
244   * Wait for the specified datanode to stop the thread / process.
245   * @return whether the operation finished with success
246   * @throws IOException if something goes wrong or timeout occurs
247   */
248  public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
249    throws IOException;
250
251  /**
252   * Starts a new master on the given hostname or if this is a mini/local cluster,
253   * starts a master locally.
254   * @param hostname the hostname to start the master on
255   * @return whether the operation finished with success
256   * @throws IOException if something goes wrong
257   */
258  public abstract void startMaster(String hostname, int port) throws IOException;
259
260  /**
261   * Kills the master process if this is a distributed cluster, otherwise,
262   * this causes master to exit doing basic clean up only.
263   * @throws IOException if something goes wrong
264   */
265  public abstract void killMaster(ServerName serverName) throws IOException;
266
267  /**
268   * Stops the given master, by attempting a gradual stop.
269   * @throws IOException if something goes wrong
270   */
271  public abstract void stopMaster(ServerName serverName) throws IOException;
272
273  /**
274   * Wait for the specified master to stop the thread / process.
275   * @throws IOException if something goes wrong or timeout occurs
276   */
277  public abstract void waitForMasterToStop(ServerName serverName, long timeout)
278      throws IOException;
279
280  /**
281   * Blocks until there is an active master and that master has completed
282   * initialization.
283   *
284   * @return true if an active master becomes available.  false if there are no
285   *         masters left.
286   * @throws IOException if something goes wrong or timeout occurs
287   */
288  public boolean waitForActiveAndReadyMaster()
289      throws IOException {
290    return waitForActiveAndReadyMaster(Long.MAX_VALUE);
291  }
292
293  /**
294   * Blocks until there is an active master and that master has completed
295   * initialization.
296   * @param timeout the timeout limit in ms
297   * @return true if an active master becomes available.  false if there are no
298   *         masters left.
299   */
300  public abstract boolean waitForActiveAndReadyMaster(long timeout)
301      throws IOException;
302
303  /**
304   * Wait for HBase Cluster to shut down.
305   */
306  public abstract void waitUntilShutDown() throws IOException;
307
308  /**
309   * Shut down the HBase cluster
310   */
311  public abstract void shutdown() throws IOException;
312
313  /**
314   * Restores the cluster to it's initial state if this is a real cluster,
315   * otherwise does nothing.
316   * This is a best effort restore. If the servers are not reachable, or insufficient
317   * permissions, etc. restoration might be partial.
318   * @return whether restoration is complete
319   */
320  public boolean restoreInitialStatus() throws IOException {
321    return restoreClusterMetrics(getInitialClusterMetrics());
322  }
323
324  /**
325   * Restores the cluster to given state if this is a real cluster,
326   * otherwise does nothing.
327   * This is a best effort restore. If the servers are not reachable, or insufficient
328   * permissions, etc. restoration might be partial.
329   * @return whether restoration is complete
330   */
331  public boolean restoreClusterMetrics(ClusterMetrics desiredStatus) throws IOException {
332    return true;
333  }
334
335  /**
336   * Get the ServerName of region server serving the first hbase:meta region
337   */
338  public ServerName getServerHoldingMeta() throws IOException {
339    return getServerHoldingRegion(TableName.META_TABLE_NAME,
340      RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName());
341  }
342
343  /**
344   * Get the ServerName of region server serving the specified region
345   * @param regionName Name of the region in bytes
346   * @param tn Table name that has the region.
347   * @return ServerName that hosts the region or null
348   */
349  public abstract ServerName getServerHoldingRegion(final TableName tn, byte[] regionName)
350      throws IOException;
351
352  /**
353   * @return whether we are interacting with a distributed cluster as opposed to an
354   * in-process mini/local cluster.
355   */
356  public boolean isDistributedCluster() {
357    return false;
358  }
359
360  /**
361   * Closes all the resources held open for this cluster. Note that this call does not shutdown
362   * the cluster.
363   * @see #shutdown()
364   */
365  @Override
366  public abstract void close() throws IOException;
367
368  /**
369   * Wait for the namenode.
370   *
371   * @throws InterruptedException
372   */
373  public void waitForNamenodeAvailable() throws InterruptedException {
374  }
375
376  public void waitForDatanodesRegistered(int nbDN) throws Exception {
377  }
378}