/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.Closeable;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Addressing;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Strings;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.net.DNS;
import org.apache.hadoop.util.StringUtils;
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;

/**
 * A base for {@link TableInputFormat}s. Receives a {@link Connection}, a {@link TableName},
 * a {@link Scan} instance that defines the input columns etc. Subclasses may use
 * other TableRecordReader implementations.
 *
 * Subclasses MUST ensure initializeTable(Connection, TableName) is called for an instance to
 * function properly. Each of the entry points to this class used by the MapReduce framework,
 * {@link #createRecordReader(InputSplit, TaskAttemptContext)} and {@link #getSplits(JobContext)},
 * will call {@link #initialize(JobContext)} as a convenient centralized location to handle
 * retrieving the necessary configuration information. If your subclass overrides either of these
 * methods, either call the parent version or call initialize yourself.
 *
 * <p>
 * An example of a subclass:
 * <pre>
 *   class ExampleTIF extends TableInputFormatBase {
 *
 *     {@literal @}Override
 *     protected void initialize(JobContext context) throws IOException {
 *       // We are responsible for the lifecycle of this connection until we hand it over in
 *       // initializeTable.
 *       Connection connection = ConnectionFactory.createConnection(HBaseConfiguration.create(
 *              context.getConfiguration()));
 *       TableName tableName = TableName.valueOf("exampleTable");
 *       // mandatory. once passed here, TableInputFormatBase will handle closing the connection.
 *       initializeTable(connection, tableName);
 *       byte[][] inputColumns = new byte [][] { Bytes.toBytes("columnA"),
 *         Bytes.toBytes("columnB") };
 *       // optional, by default we'll get everything for the table.
 *       Scan scan = new Scan();
 *       for (byte[] family : inputColumns) {
 *         scan.addFamily(family);
 *       }
 *       Filter exampleFilter = new RowFilter(CompareOp.EQUAL, new RegexStringComparator("aa.*"));
 *       scan.setFilter(exampleFilter);
 *       setScan(scan);
 *     }
 *   }
 * </pre>
 *
 * The number of InputSplits (mappers) matches the number of regions in a table by default.
 * Set "hbase.mapreduce.tableinput.mappers.per.region" to specify how many mappers to run per
 * region; setting this property disables the autobalance feature described below.
 * Set "hbase.mapreduce.tif.input.autobalance" to enable autobalance, in which case HBase
 * assigns mappers based on the average region size: regions larger than the average may be
 * assigned more mappers, while smaller adjacent regions may be grouped together to share one
 * mapper. If the actual average region size is very large, say 50G, it is not desirable to
 * assign only one mapper to each such large region. Use "hbase.mapreduce.tif.ave.regionsize"
 * to cap the average region size used when autobalance is enabled; the default cap is 8G.
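 * <p>
 * A minimal sketch of how these properties might be set on a job (the {@code job} variable and
 * the values shown are illustrative, not recommendations):
 * <pre>
 *   Configuration conf = job.getConfiguration();
 *   // run two mappers per region; setting this disables autobalance
 *   conf.setInt("hbase.mapreduce.tableinput.mappers.per.region", 2);
 *   // or instead let HBase balance mappers by region size, capping the average at 4G
 *   conf.setBoolean("hbase.mapreduce.tif.input.autobalance", true);
 *   conf.setLong("hbase.mapreduce.tif.ave.regionsize", 4L * 1024 * 1024 * 1024);
 * </pre>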
 */
@InterfaceAudience.Public
public abstract class TableInputFormatBase
    extends InputFormat<ImmutableBytesWritable, Result> {

  private static final Logger LOG = LoggerFactory.getLogger(TableInputFormatBase.class);

  private static final String NOT_INITIALIZED = "The input format instance has not been properly " +
      "initialized. Ensure you call initializeTable either in your constructor or initialize " +
      "method";
  private static final String INITIALIZATION_ERROR = "Cannot create a record reader because of a" +
      " previous error. Please look at the previous log lines from" +
      " the task's full log for more details.";

  /** Specify if we enable auto-balance to set the number of mappers in M/R jobs. */
  public static final String MAPREDUCE_INPUT_AUTOBALANCE = "hbase.mapreduce.tif.input.autobalance";
  /** In auto-balance, the input is split by average region size; use this to cap the average if
   * the calculated value is too large. */
  public static final String MAX_AVERAGE_REGION_SIZE = "hbase.mapreduce.tif.ave.regionsize";

  /** Specify the number of mappers for each region; all regions get the same number of mappers. */
  public static final String NUM_MAPPERS_PER_REGION = "hbase.mapreduce.tableinput.mappers.per.region";


  /** Holds the details for the internal scanner.
   *
   * @see Scan */
  private Scan scan = null;
  /** The {@link Admin}. */
  private Admin admin;
  /** The {@link Table} to scan. */
  private Table table;
  /** The {@link RegionLocator} of the table. */
  private RegionLocator regionLocator;
  /** The reader scanning the table, can be a custom one. */
  private TableRecordReader tableRecordReader = null;
  /** The underlying {@link Connection} of the table. */
  private Connection connection;


  /** The reverse DNS lookup cache mapping: IPAddress => HostName */
  private HashMap<InetAddress, String> reverseDNSCacheMap =
      new HashMap<>();

  /**
   * Builds a {@link TableRecordReader}. If no {@link TableRecordReader} was provided, uses
   * the default.
   *
   * @param split  The split to work with.
   * @param context  The current context.
   * @return The newly created record reader.
   * @throws IOException When creating the reader fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
   *   org.apache.hadoop.mapreduce.InputSplit,
   *   org.apache.hadoop.mapreduce.TaskAttemptContext)
   */
  @Override
  public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
      InputSplit split, TaskAttemptContext context)
      throws IOException {
    // Just in case a subclass is relying on JobConfigurable magic.
    if (table == null) {
      initialize(context);
    }
    // null check in case our child overrides getTable to not throw.
    try {
      if (getTable() == null) {
        // initialize() must not have been implemented in the subclass.
        throw new IOException(INITIALIZATION_ERROR);
      }
    } catch (IllegalStateException exception) {
      throw new IOException(INITIALIZATION_ERROR, exception);
    }
    TableSplit tSplit = (TableSplit) split;
    LOG.info("Input split length: " + StringUtils.humanReadableInt(tSplit.getLength()) + " bytes.");
    final TableRecordReader trr =
        this.tableRecordReader != null ? this.tableRecordReader : new TableRecordReader();
    Scan sc = new Scan(this.scan);
    sc.setStartRow(tSplit.getStartRow());
    sc.setStopRow(tSplit.getEndRow());
    trr.setScan(sc);
    trr.setTable(getTable());
    return new RecordReader<ImmutableBytesWritable, Result>() {

      @Override
      public void close() throws IOException {
        trr.close();
        closeTable();
      }

      @Override
      public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
        return trr.getCurrentKey();
      }

      @Override
      public Result getCurrentValue() throws IOException, InterruptedException {
        return trr.getCurrentValue();
      }

      @Override
      public float getProgress() throws IOException, InterruptedException {
        return trr.getProgress();
      }

      @Override
      public void initialize(InputSplit inputsplit, TaskAttemptContext context) throws IOException,
          InterruptedException {
        trr.initialize(inputsplit, context);
      }

      @Override
      public boolean nextKeyValue() throws IOException, InterruptedException {
        return trr.nextKeyValue();
      }
    };
  }

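  /**
   * Returns the start and end keys of all regions in the table, as reported by the
   * {@link RegionLocator}.
   */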
  protected Pair<byte[][], byte[][]> getStartEndKeys() throws IOException {
    return getRegionLocator().getStartEndKeys();
  }

  /**
   * Calculates the splits that will serve as input for the map tasks.
   * @param context  The current job context.
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *   org.apache.hadoop.mapreduce.JobContext)
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    boolean closeOnFinish = false;

    // Just in case a subclass is relying on JobConfigurable magic.
    if (table == null) {
      initialize(context);
      closeOnFinish = true;
    }

    // null check in case our child overrides getTable to not throw.
    try {
      if (getTable() == null) {
        // initialize() must not have been implemented in the subclass.
        throw new IOException(INITIALIZATION_ERROR);
      }
    } catch (IllegalStateException exception) {
      throw new IOException(INITIALIZATION_ERROR, exception);
    }

    try {
      List<InputSplit> splits = oneInputSplitPerRegion();

      // set same number of mappers for each region
      if (context.getConfiguration().get(NUM_MAPPERS_PER_REGION) != null) {
        int nSplitsPerRegion = context.getConfiguration().getInt(NUM_MAPPERS_PER_REGION, 1);
        List<InputSplit> res = new ArrayList<>();
        for (int i = 0; i < splits.size(); i++) {
          List<InputSplit> tmp = createNInputSplitsUniform(splits.get(i), nSplitsPerRegion);
          res.addAll(tmp);
        }
        return res;
      }

      // The default value of "hbase.mapreduce.tif.input.autobalance" is false.
      if (context.getConfiguration().getBoolean(MAPREDUCE_INPUT_AUTOBALANCE, false)) {
        long maxAveRegionSize = context.getConfiguration()
            .getLong(MAX_AVERAGE_REGION_SIZE, 8L * 1073741824); // 8GB
        return calculateAutoBalancedSplits(splits, maxAveRegionSize);
      }

      // return one mapper per region
      return splits;
    } finally {
      if (closeOnFinish) {
        closeTable();
      }
    }
  }

  /**
   * Create one InputSplit per region.
   *
   * @return The list of InputSplit for all the regions
   * @throws IOException
   */
  private List<InputSplit> oneInputSplitPerRegion() throws IOException {
    RegionSizeCalculator sizeCalculator =
        createRegionSizeCalculator(getRegionLocator(), getAdmin());

    TableName tableName = getTable().getName();

    Pair<byte[][], byte[][]> keys = getStartEndKeys();
    if (keys == null || keys.getFirst() == null ||
        keys.getFirst().length == 0) {
      HRegionLocation regLoc =
          getRegionLocator().getRegionLocation(HConstants.EMPTY_BYTE_ARRAY, false);
      if (null == regLoc) {
        throw new IOException("Expecting at least one region.");
      }
      List<InputSplit> splits = new ArrayList<>(1);
      long regionSize = sizeCalculator.getRegionSize(regLoc.getRegionInfo().getRegionName());
      TableSplit split = new TableSplit(tableName, scan,
          HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY, regLoc
          .getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0], regionSize);
      splits.add(split);
      return splits;
    }
    List<InputSplit> splits = new ArrayList<>(keys.getFirst().length);
    for (int i = 0; i < keys.getFirst().length; i++) {
      if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
        continue;
      }

      byte[] startRow = scan.getStartRow();
      byte[] stopRow = scan.getStopRow();
      // determine if the given start and stop keys fall into the region
      if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
          Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
          (stopRow.length == 0 ||
              Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
        byte[] splitStart = startRow.length == 0 ||
            Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
            keys.getFirst()[i] : startRow;
        byte[] splitStop = (stopRow.length == 0 ||
            Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
            keys.getSecond()[i].length > 0 ?
            keys.getSecond()[i] : stopRow;

        HRegionLocation location =
            getRegionLocator().getRegionLocation(keys.getFirst()[i], false);
        // The below InetSocketAddress creation does a name resolution.
        InetSocketAddress isa = new InetSocketAddress(location.getHostname(), location.getPort());
        if (isa.isUnresolved()) {
          LOG.warn("Failed to resolve " + isa);
        }
        InetAddress regionAddress = isa.getAddress();
        String regionLocation;
        regionLocation = reverseDNS(regionAddress);

        byte[] regionName = location.getRegionInfo().getRegionName();
        String encodedRegionName = location.getRegionInfo().getEncodedName();
        long regionSize = sizeCalculator.getRegionSize(regionName);
        TableSplit split = new TableSplit(tableName, scan,
            splitStart, splitStop, regionLocation, encodedRegionName, regionSize);
        splits.add(split);
        if (LOG.isDebugEnabled()) {
          LOG.debug("getSplits: split -> " + i + " -> " + split);
        }
      }
    }
    return splits;
  }

  /**
   * Create n splits for one InputSplit. For now only uniform distribution is supported.
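   * <p>
   * For example, a hypothetical call that carves one region's split into four key ranges of
   * equal width ({@code regionSplit} here stands for a {@link TableSplit} obtained from
   * {@link #getSplits(JobContext)}):
   * <pre>
   *   List&lt;InputSplit&gt; subSplits = createNInputSplitsUniform(regionSplit, 4);
   * </pre>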
   * @param split A TableSplit corresponding to a range of rowkeys
   * @param n     Number of ranges after splitting. Pass 1 to leave the range unsplit,
   *              pass 2 to split the range in two, and so on.
   * @return A list of TableSplit, the size of the list is n
   * @throws IllegalArgumentIOException if the split is null or not a TableSplit
   */
  protected List<InputSplit> createNInputSplitsUniform(InputSplit split, int n)
      throws IllegalArgumentIOException {
    if (split == null || !(split instanceof TableSplit)) {
      throw new IllegalArgumentIOException(
          "InputSplit for CreateNSplitsPerRegion can not be null"
              + " and should be an instance of TableSplit");
    }
    // if n < 1, still continue using n = 1
    n = n < 1 ? 1 : n;
    List<InputSplit> res = new ArrayList<>(n);
    if (n == 1) {
      res.add(split);
      return res;
    }

    // Collect Region related information
    TableSplit ts = (TableSplit) split;
    TableName tableName = ts.getTable();
    String regionLocation = ts.getRegionLocation();
    String encodedRegionName = ts.getEncodedRegionName();
    long regionSize = ts.getLength();
    byte[] startRow = ts.getStartRow();
    byte[] endRow = ts.getEndRow();

    // For special case: startRow or endRow is empty
    if (startRow.length == 0 && endRow.length == 0) {
      startRow = new byte[1];
      endRow = new byte[1];
      startRow[0] = 0;
      endRow[0] = -1;
    }
    if (startRow.length == 0 && endRow.length != 0) {
      startRow = new byte[1];
      startRow[0] = 0;
    }
    if (startRow.length != 0 && endRow.length == 0) {
      endRow = new byte[startRow.length];
      for (int k = 0; k < startRow.length; k++) {
        endRow[k] = -1;
      }
    }

    // Split Region into n chunks evenly
    byte[][] splitKeys = Bytes.split(startRow, endRow, true, n - 1);
    for (int i = 0; i < splitKeys.length - 1; i++) {
      // notice that the regionSize parameter may not be very accurate
      TableSplit tsplit =
          new TableSplit(tableName, scan, splitKeys[i], splitKeys[i + 1], regionLocation,
              encodedRegionName, regionSize / n);
      res.add(tsplit);
    }
    return res;
  }

  /**
   * Calculates the number of MapReduce input splits for the map tasks. The number of
   * MapReduce input splits depends on the average region size.
   * Made 'public' for testing.
   *
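   * <p>
   * A worked example of the balancing below: given three contiguous regions of 20 GB, 1 GB and
   * 1 GB, the average region size is about 7.3 GB (under the default 8 GB cap); the 20 GB region
   * is divided into Math.round(log(20 / 7.3) + 1) = 2 splits, while the two 1 GB regions are
   * combined into a single split, yielding 3 input splits in total.
   *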
   * @param splits The list of input splits before balancing.
   * @param maxAverageRegionSize max average region size for one mapper
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *   org.apache.hadoop.mapreduce.JobContext)
   */
  public List<InputSplit> calculateAutoBalancedSplits(List<InputSplit> splits,
      long maxAverageRegionSize) throws IOException {
    if (splits.size() == 0) {
      return splits;
    }
    List<InputSplit> resultList = new ArrayList<>();
    long totalRegionSize = 0;
    for (int i = 0; i < splits.size(); i++) {
      TableSplit ts = (TableSplit) splits.get(i);
      totalRegionSize += ts.getLength();
    }
    long averageRegionSize = totalRegionSize / splits.size();
    // totalRegionSize might overflow, and the averageRegionSize must be positive.
    if (averageRegionSize <= 0) {
      LOG.warn("The averageRegionSize is not positive: " + averageRegionSize + ", " +
          "set it to Long.MAX_VALUE / " + splits.size());
      averageRegionSize = Long.MAX_VALUE / splits.size();
    }
    // if averageRegionSize is too big, cap it at maxAverageRegionSize (8 GB by default)
    if (averageRegionSize > maxAverageRegionSize) {
      averageRegionSize = maxAverageRegionSize;
    }
    // if averageRegionSize is too small, we do not need to allocate more mappers for the 'large'
    // regions; set the floor to 16M = (default hdfs block size) / 4
    if (averageRegionSize < 16 * 1048576) {
      return splits;
    }
    for (int i = 0; i < splits.size(); i++) {
      TableSplit ts = (TableSplit) splits.get(i);
      TableName tableName = ts.getTable();
      String regionLocation = ts.getRegionLocation();
      String encodedRegionName = ts.getEncodedRegionName();
      long regionSize = ts.getLength();

      if (regionSize >= averageRegionSize) {
        // split this region into multiple MapReduce input splits.
        int n = (int) Math.round(
            Math.log(((double) regionSize) / ((double) averageRegionSize)) + 1.0);
        List<InputSplit> temp = createNInputSplitsUniform(ts, n);
        resultList.addAll(temp);
      } else {
        // if the total size of several small contiguous regions is less than the average region
        // size, combine them into one MapReduce input split.
        long totalSize = regionSize;
        byte[] splitStartKey = ts.getStartRow();
        byte[] splitEndKey = ts.getEndRow();
        int j = i + 1;
        while (j < splits.size()) {
          TableSplit nextRegion = (TableSplit) splits.get(j);
          long nextRegionSize = nextRegion.getLength();
          if (totalSize + nextRegionSize <= averageRegionSize
              && Bytes.equals(splitEndKey, nextRegion.getStartRow())) {
            totalSize = totalSize + nextRegionSize;
            splitEndKey = nextRegion.getEndRow();
            j++;
          } else {
            break;
          }
        }
        i = j - 1;
        TableSplit t = new TableSplit(tableName, scan, splitStartKey, splitEndKey,
            regionLocation, encodedRegionName, totalSize);
        resultList.add(t);
      }
    }
    return resultList;
  }

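  /**
   * Performs a reverse DNS lookup for the given region server address, caching the result so
   * that repeated lookups for the same address do not query the resolver again.
   */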
  String reverseDNS(InetAddress ipAddress) throws UnknownHostException {
    String hostName = this.reverseDNSCacheMap.get(ipAddress);
    if (hostName == null) {
      String ipAddressString = null;
      try {
        ipAddressString = DNS.reverseDns(ipAddress, null);
      } catch (Exception e) {
        // We can use InetAddress in case the jndi failed to pull up the reverse DNS entry from the
        // name service. Also, in case of ipv6, we need to use the InetAddress since resolving
        // reverse DNS using jndi doesn't work well with ipv6 addresses.
        ipAddressString = InetAddress.getByName(ipAddress.getHostAddress()).getHostName();
      }
      if (ipAddressString == null) {
        throw new UnknownHostException("No host found for " + ipAddress);
      }
      hostName = Strings.domainNamePointerToHostName(ipAddressString);
      this.reverseDNSCacheMap.put(ipAddress, hostName);
    }
    return hostName;
  }

  /**
   * Test if the given region is to be included in the InputSplit while splitting
   * the regions of a table.
   * <p>
   * This optimization is effective when there is a specific reason to exclude an entire region
   * from the M-R job (and hence it does not contribute an InputSplit), given the start and end
   * keys of that region.
   * <br>
   * It is useful when the job needs to remember the last-processed top record and continuously
   * revisit the [last, current) interval for M-R processing. Besides reducing the number of
   * InputSplits, this also reduces the load on the region server, thanks to the ordering of
   * the keys.
   * <br>
   * <br>
   * Note: it is possible that <code>endKey.length() == 0</code>, for the last (most recent)
   * region.
   * <br>
   * Override this method if you want to bulk exclude regions altogether from M-R.
   * By default no region is excluded (i.e. all regions are included).
   *
   * @param startKey Start key of the region
   * @param endKey End key of the region
   * @return true, if this region needs to be included as part of the input (default).
   *
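   * <p>
   * A hypothetical override that skips every region whose end key sorts at or before a
   * previously processed checkpoint key (the {@code checkpoint} field is an assumption made for
   * this example, not part of this class):
   * <pre>
   *   {@literal @}Override
   *   protected boolean includeRegionInSplit(final byte[] startKey, final byte[] endKey) {
   *     // a region with an empty end key is the last region of the table; always keep it
   *     return endKey.length == 0 || Bytes.compareTo(endKey, checkpoint) > 0;
   *   }
   * </pre>
   *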
   */
  protected boolean includeRegionInSplit(final byte[] startKey, final byte[] endKey) {
    return true;
  }

  /**
   * Allows subclasses to get the {@link RegionLocator}.
   */
  protected RegionLocator getRegionLocator() {
    if (regionLocator == null) {
      throw new IllegalStateException(NOT_INITIALIZED);
    }
    return regionLocator;
  }

  /**
   * Allows subclasses to get the {@link Table}.
   */
  protected Table getTable() {
    if (table == null) {
      throw new IllegalStateException(NOT_INITIALIZED);
    }
    return table;
  }

  /**
   * Allows subclasses to get the {@link Admin}.
   */
  protected Admin getAdmin() {
    if (admin == null) {
      throw new IllegalStateException(NOT_INITIALIZED);
    }
    return admin;
  }

  /**
   * Allows subclasses to initialize the table information.
   *
   * @param connection  The Connection to the HBase cluster. MUST be unmanaged. This class will
   *   close it when done.
   * @param tableName  The {@link TableName} of the table to process.
   * @throws IOException if obtaining the {@link Table}, {@link RegionLocator} or {@link Admin}
   *   from the connection fails
   */
  protected void initializeTable(Connection connection, TableName tableName) throws IOException {
    if (this.table != null || this.connection != null) {
      LOG.warn("initializeTable called multiple times. Overwriting connection and table " +
          "reference; TableInputFormatBase will not close these old references when done.");
    }
    this.table = connection.getTable(tableName);
    this.regionLocator = connection.getRegionLocator(tableName);
    this.admin = connection.getAdmin();
    this.connection = connection;
  }

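  /**
   * Creates the {@link RegionSizeCalculator} used when computing splits. Kept as a separate
   * factory method mainly so tests can override it and supply a canned calculator.
   */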
  @VisibleForTesting
  protected RegionSizeCalculator createRegionSizeCalculator(RegionLocator locator, Admin admin)
      throws IOException {
    return new RegionSizeCalculator(locator, admin);
  }

  /**
   * Gets the scan defining the actual details like columns etc.
   *
   * @return The internal scan instance.
   */
  public Scan getScan() {
    if (this.scan == null) this.scan = new Scan();
    return scan;
  }

  /**
   * Sets the scan defining the actual details like columns etc.
   *
   * @param scan  The scan to set.
   */
  public void setScan(Scan scan) {
    this.scan = scan;
  }

  /**
   * Allows subclasses to set the {@link TableRecordReader}.
   *
   * @param tableRecordReader A different {@link TableRecordReader}
   *   implementation.
   */
  protected void setTableRecordReader(TableRecordReader tableRecordReader) {
    this.tableRecordReader = tableRecordReader;
  }

  /**
   * Handle subclass specific set up.
   * Each of the entry points used by the MapReduce framework,
   * {@link #createRecordReader(InputSplit, TaskAttemptContext)} and {@link #getSplits(JobContext)},
   * will call {@link #initialize(JobContext)} as a convenient centralized location to handle
   * retrieving the necessary configuration information and calling
   * {@link #initializeTable(Connection, TableName)}.
   *
   * Subclasses should implement their initialize call such that it is safe to call multiple times.
   * The current TableInputFormatBase implementation relies on a non-null table reference to decide
   * if an initialize call is needed, but this behavior may change in the future. In particular,
   * it is critical that initializeTable not be called multiple times since this will leak
   * Connection instances.
   *
   */
  protected void initialize(JobContext context) throws IOException {
  }

  /**
   * Close the Table and related objects that were initialized via
   * {@link #initializeTable(Connection, TableName)}.
   *
   * @throws IOException
   */
  protected void closeTable() throws IOException {
    close(admin, table, regionLocator, connection);
    admin = null;
    table = null;
    regionLocator = null;
    connection = null;
  }

  private void close(Closeable... closables) throws IOException {
    for (Closeable c : closables) {
      if (c != null) { c.close(); }
    }
  }

}