/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.Closeable;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Addressing;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Strings;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.net.DNS;
import org.apache.hadoop.util.StringUtils;
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;

/**
 * A base for {@link TableInputFormat}s. Receives a {@link Connection}, a {@link TableName},
 * an {@link Scan} instance that defines the input columns etc. Subclasses may use
 * other TableRecordReader implementations.
 *
 * Subclasses MUST ensure initializeTable(Connection, TableName) is called for an instance to
 * function properly. Each of the entry points to this class used by the MapReduce framework,
 * {@link #createRecordReader(InputSplit, TaskAttemptContext)} and {@link #getSplits(JobContext)},
 * will call {@link #initialize(JobContext)} as a convenient centralized location to handle
 * retrieving the necessary configuration information. If your subclass overrides either of these
 * methods, either call the parent version or call initialize yourself.
 *
 * <p>
 * An example of a subclass:
 * <pre>
 *   class ExampleTIF extends TableInputFormatBase {
 *
 *     {@literal @}Override
 *     protected void initialize(JobContext context) throws IOException {
 *       // We are responsible for the lifecycle of this connection until we hand it over in
 *       // initializeTable.
 *       Connection connection = ConnectionFactory.createConnection(HBaseConfiguration.create(
 *              context.getConfiguration()));
 *       TableName tableName = TableName.valueOf("exampleTable");
 *       // mandatory. once passed here, TableInputFormatBase will handle closing the connection.
 *       initializeTable(connection, tableName);
 *       byte[][] inputColumns = new byte [][] { Bytes.toBytes("columnA"),
 *         Bytes.toBytes("columnB") };
 *       // optional, by default we'll get everything for the table.
 *       Scan scan = new Scan();
 *       for (byte[] family : inputColumns) {
 *         scan.addFamily(family);
 *       }
 *       Filter exampleFilter = new RowFilter(CompareOp.EQUAL, new RegexStringComparator("aa.*"));
 *       scan.setFilter(exampleFilter);
 *       setScan(scan);
 *     }
 *   }
 * </pre>
 *
 *
 * The number of InputSplits (mappers) matches the number of regions in a table by default.
 * Set "hbase.mapreduce.tableinput.mappers.per.region" to specify how many mappers per region;
 * setting this property disables the auto-balance described below.
 * Set "hbase.mapreduce.tif.input.autobalance" to enable auto-balance: HBase will assign mappers
 * based on the average region size. Regions larger than the average region size may be assigned
 * more mappers, and smaller ones may be grouped together to share one mapper. If the actual
 * average region size is too big, like 50G, it is not good to assign only 1 mapper to those large
 * regions. Use "hbase.mapreduce.tif.ave.regionsize" to set the max average region size when
 * auto-balance is enabled; the default max average region size is 8G.
 */
@InterfaceAudience.Public
public abstract class TableInputFormatBase
    extends InputFormat<ImmutableBytesWritable, Result> {

  private static final Logger LOG = LoggerFactory.getLogger(TableInputFormatBase.class);

  private static final String NOT_INITIALIZED = "The input format instance has not been properly " +
    "initialized. Ensure you call initializeTable either in your constructor or initialize " +
    "method";
  private static final String INITIALIZATION_ERROR = "Cannot create a record reader because of a" +
    " previous error. Please look at the previous logs lines from" +
    " the task's full log for more details.";

  /** Specify if we enable auto-balance to set number of mappers in M/R jobs. */
  public static final String MAPREDUCE_INPUT_AUTOBALANCE = "hbase.mapreduce.tif.input.autobalance";
  /**
   * In auto-balance, we split input by average region size; if the calculated region size is too
   * big, it can be capped with this property.
   */
  public static final String MAX_AVERAGE_REGION_SIZE = "hbase.mapreduce.tif.ave.regionsize";

  /** Set the number of Mappers for each region, all regions have same number of Mappers */
  public static final String NUM_MAPPERS_PER_REGION = "hbase.mapreduce.tableinput.mappers.per.region";

  /** Default cap on the average region size used by auto-balance: 8 GB. */
  private static final long DEFAULT_MAX_AVERAGE_REGION_SIZE = 8L * 1073741824;
  /** Below this average region size (16 MB) auto-balance leaves the splits untouched. */
  private static final long MIN_AVERAGE_REGION_SIZE_FOR_BALANCE = 16L * 1048576;

  /** Holds the details for the internal scanner.
   *
   * @see Scan */
  private Scan scan = null;
  /** The {@link Admin}. */
  private Admin admin;
  /** The {@link Table} to scan. */
  private Table table;
  /** The {@link RegionLocator} of the table. */
  private RegionLocator regionLocator;
  /** The reader scanning the table, can be a custom one. */
  private TableRecordReader tableRecordReader = null;
  /** The underlying {@link Connection} of the table. */
  private Connection connection;

  /** The reverse DNS lookup cache mapping: IPAddress => HostName */
  private final Map<InetAddress, String> reverseDNSCacheMap = new HashMap<>();

  /**
   * Builds a {@link TableRecordReader}. If no {@link TableRecordReader} was provided, uses
   * the default.
   *
   * @param split  The split to work with.
   * @param context  The current context.
   * @return The newly created record reader.
   * @throws IOException When creating the reader fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
   *   org.apache.hadoop.mapreduce.InputSplit,
   *   org.apache.hadoop.mapreduce.TaskAttemptContext)
   */
  @Override
  public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
      InputSplit split, TaskAttemptContext context)
      throws IOException {
    // Just in case a subclass is relying on JobConfigurable magic.
    if (table == null) {
      initialize(context);
    }
    // null check in case our child overrides getTable to not throw.
    try {
      if (getTable() == null) {
        // initialize() must not have been implemented in the subclass.
        throw new IOException(INITIALIZATION_ERROR);
      }
    } catch (IllegalStateException exception) {
      throw new IOException(INITIALIZATION_ERROR, exception);
    }
    TableSplit tSplit = (TableSplit) split;
    LOG.info("Input split length: {} bytes.", StringUtils.humanReadableInt(tSplit.getLength()));
    final TableRecordReader trr =
        this.tableRecordReader != null ? this.tableRecordReader : new TableRecordReader();
    // getScan() lazily creates a default Scan, so a subclass that never called setScan (the scan
    // is documented as optional) does not trip over a null here.
    Scan sc = new Scan(getScan());
    sc.setStartRow(tSplit.getStartRow());
    sc.setStopRow(tSplit.getEndRow());
    trr.setScan(sc);
    trr.setTable(getTable());
    // Thin delegate around trr whose close() additionally releases the table resources.
    return new RecordReader<ImmutableBytesWritable, Result>() {

      @Override
      public void close() throws IOException {
        trr.close();
        closeTable();
      }

      @Override
      public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
        return trr.getCurrentKey();
      }

      @Override
      public Result getCurrentValue() throws IOException, InterruptedException {
        return trr.getCurrentValue();
      }

      @Override
      public float getProgress() throws IOException, InterruptedException {
        return trr.getProgress();
      }

      @Override
      public void initialize(InputSplit inputsplit, TaskAttemptContext context) throws IOException,
          InterruptedException {
        trr.initialize(inputsplit, context);
      }

      @Override
      public boolean nextKeyValue() throws IOException, InterruptedException {
        return trr.nextKeyValue();
      }
    };
  }

  /**
   * Returns the start and end keys of the table's regions as reported by the
   * {@link RegionLocator}.
   *
   * @return pair of (start keys, end keys)
   * @throws IOException if the region locator lookup fails
   */
  protected Pair<byte[][],byte[][]> getStartEndKeys() throws IOException {
    return getRegionLocator().getStartEndKeys();
  }

  /**
   * Calculates the splits that will serve as input for the map tasks.
   * @param context  The current job context.
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *   org.apache.hadoop.mapreduce.JobContext)
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    boolean closeOnFinish = false;

    // Just in case a subclass is relying on JobConfigurable magic.
    if (table == null) {
      initialize(context);
      closeOnFinish = true;
    }

    // null check in case our child overrides getTable to not throw.
    try {
      if (getTable() == null) {
        // initialize() must not have been implemented in the subclass.
        throw new IOException(INITIALIZATION_ERROR);
      }
    } catch (IllegalStateException exception) {
      throw new IOException(INITIALIZATION_ERROR, exception);
    }

    try {
      List<InputSplit> splits = oneInputSplitPerRegion();

      // set same number of mappers for each region
      if (context.getConfiguration().get(NUM_MAPPERS_PER_REGION) != null) {
        int nSplitsPerRegion = context.getConfiguration().getInt(NUM_MAPPERS_PER_REGION, 1);
        List<InputSplit> res = new ArrayList<>();
        for (InputSplit regionSplit : splits) {
          res.addAll(createNInputSplitsUniform(regionSplit, nSplitsPerRegion));
        }
        return res;
      }

      // The default value of "hbase.mapreduce.tif.input.autobalance" is false.
      if (context.getConfiguration().getBoolean(MAPREDUCE_INPUT_AUTOBALANCE, false)) {
        long maxAveRegionSize = context.getConfiguration()
            .getLong(MAX_AVERAGE_REGION_SIZE, DEFAULT_MAX_AVERAGE_REGION_SIZE);
        return calculateAutoBalancedSplits(splits, maxAveRegionSize);
      }

      // return one mapper per region
      return splits;
    } finally {
      // Only release the table if this call was the one that initialized it.
      if (closeOnFinish) {
        closeTable();
      }
    }
  }

  /**
   * Create one InputSplit per region
   *
   * @return The list of InputSplit for all the regions
   * @throws IOException if region locations or sizes cannot be retrieved
   */
  private List<InputSplit> oneInputSplitPerRegion() throws IOException {
    RegionSizeCalculator sizeCalculator =
        createRegionSizeCalculator(getRegionLocator(), getAdmin());

    TableName tableName = getTable().getName();

    Pair<byte[][], byte[][]> keys = getStartEndKeys();
    if (keys == null || keys.getFirst() == null ||
        keys.getFirst().length == 0) {
      // No region boundaries reported: cover the whole key space with the single region that
      // hosts the empty row.
      HRegionLocation regLoc =
          getRegionLocator().getRegionLocation(HConstants.EMPTY_BYTE_ARRAY, false);
      if (null == regLoc) {
        throw new IOException("Expecting at least one region.");
      }
      List<InputSplit> splits = new ArrayList<>(1);
      long regionSize = sizeCalculator.getRegionSize(regLoc.getRegionInfo().getRegionName());
      TableSplit split = new TableSplit(tableName, scan,
          HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY, regLoc
              .getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0], regionSize);
      splits.add(split);
      return splits;
    }

    byte[][] startKeys = keys.getFirst();
    byte[][] endKeys = keys.getSecond();
    List<InputSplit> splits = new ArrayList<>(startKeys.length);
    for (int i = 0; i < startKeys.length; i++) {
      if (!includeRegionInSplit(startKeys[i], endKeys[i])) {
        continue;
      }

      // getScan() falls back to a default (full-table) Scan when none was set.
      Scan currentScan = getScan();
      byte[] startRow = currentScan.getStartRow();
      byte[] stopRow = currentScan.getStopRow();
      // determine if the given start and stop key fall into the region
      if ((startRow.length == 0 || endKeys[i].length == 0 ||
          Bytes.compareTo(startRow, endKeys[i]) < 0) &&
          (stopRow.length == 0 ||
          Bytes.compareTo(stopRow, startKeys[i]) > 0)) {
        // Clip the split to the intersection of the region range and the scan range.
        byte[] splitStart = startRow.length == 0 ||
            Bytes.compareTo(startKeys[i], startRow) >= 0 ?
            startKeys[i] : startRow;
        byte[] splitStop = (stopRow.length == 0 ||
            Bytes.compareTo(endKeys[i], stopRow) <= 0) &&
            endKeys[i].length > 0 ?
            endKeys[i] : stopRow;

        HRegionLocation location = getRegionLocator().getRegionLocation(startKeys[i], false);
        // The below InetSocketAddress creation does a name resolution.
        InetSocketAddress isa = new InetSocketAddress(location.getHostname(), location.getPort());
        String regionLocation;
        if (isa.isUnresolved()) {
          // An unresolved socket address carries no InetAddress, so reverse DNS is impossible.
          // The location is only a locality hint; fall back to the name the cluster reported.
          LOG.warn("Failed resolve " + isa);
          regionLocation = location.getHostname();
        } else {
          regionLocation = reverseDNS(isa.getAddress());
        }

        byte[] regionName = location.getRegionInfo().getRegionName();
        String encodedRegionName = location.getRegionInfo().getEncodedName();
        long regionSize = sizeCalculator.getRegionSize(regionName);
        TableSplit split = new TableSplit(tableName, currentScan,
            splitStart, splitStop, regionLocation, encodedRegionName, regionSize);
        splits.add(split);
        if (LOG.isDebugEnabled()) {
          LOG.debug("getSplits: split -> " + i + " -> " + split);
        }
      }
    }
    return splits;
  }

  /**
   * Create n splits for one InputSplit, For now only support uniform distribution
   * <p>
   * The first produced split always starts at the start row of {@code split} and the last one
   * always ends at its end row, so the union of the returned splits covers exactly the range of
   * the input split (including open-ended ranges expressed by an empty start or end row).
   *
   * @param split A TableSplit corresponding to a range of rowkeys
   * @param n     Number of ranges after splitting.  Pass 1 means no split for the range
   *              Pass 2 if you want to split the range in two;
   * @return A list of TableSplit, the size of the list is n
   * @throws IllegalArgumentIOException if {@code split} is null or not a {@link TableSplit}
   */
  protected List<InputSplit> createNInputSplitsUniform(InputSplit split, int n)
      throws IllegalArgumentIOException {
    if (!(split instanceof TableSplit)) {
      throw new IllegalArgumentIOException(
          "InputSplit for CreateNSplitsPerRegion can not be null "
              + "and should be instance of TableSplit");
    }
    // if n < 1, then still continue using n = 1
    n = Math.max(n, 1);
    List<InputSplit> res = new ArrayList<>(n);
    if (n == 1) {
      res.add(split);
      return res;
    }

    // Collect Region related information
    TableSplit ts = (TableSplit) split;
    TableName tableName = ts.getTable();
    String regionLocation = ts.getRegionLocation();
    String encodedRegionName = ts.getEncodedRegionName();
    long regionSize = ts.getLength();
    byte[] regionStartRow = ts.getStartRow();
    byte[] regionEndRow = ts.getEndRow();

    // An empty start/end row means "unbounded", which cannot be interpolated. Substitute
    // synthetic bounds for computing the intermediate keys only; the real boundaries are
    // restored on the outermost chunks below.
    byte[] startRow = regionStartRow;
    byte[] endRow = regionEndRow;
    if (startRow.length == 0 && endRow.length == 0) {
      startRow = new byte[] { 0 };
      endRow = new byte[] { -1 };
    } else if (startRow.length == 0) {
      startRow = new byte[] { 0 };
    } else if (endRow.length == 0) {
      endRow = new byte[startRow.length];
      for (int k = 0; k < endRow.length; k++) {
        endRow[k] = -1;
      }
    }

    // Split Region into n chunks evenly
    byte[][] splitKeys = Bytes.split(startRow, endRow, true, n - 1);
    if (splitKeys == null) {
      // NOTE(review): Bytes.split is assumed to be able to return null when the range cannot be
      // divided; in that case keep the region as a single split rather than failing.
      res.add(split);
      return res;
    }
    int lastChunk = splitKeys.length - 2;
    for (int i = 0; i <= lastChunk; i++) {
      // Keep the genuine region boundaries on the outer edges so that an open-ended region
      // (empty start or end row) stays open-ended and no rows fall outside the splits.
      byte[] chunkStart = (i == 0) ? regionStartRow : splitKeys[i];
      byte[] chunkEnd = (i == lastChunk) ? regionEndRow : splitKeys[i + 1];
      // notice that the regionSize parameter may be not very accurate
      TableSplit tsplit =
          new TableSplit(tableName, scan, chunkStart, chunkEnd, regionLocation,
              encodedRegionName, regionSize / n);
      res.add(tsplit);
    }
    return res;
  }

  /**
   * Calculates the number of MapReduce input splits for the map tasks. The number of
   * MapReduce input splits depends on the average region size.
   * Make it 'public' for testing
   *
   * @param splits The list of input splits before balance.
   * @param maxAverageRegionSize max Average region size for one mapper
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *org.apache.hadoop.mapreduce.JobContext)
   */
  public List<InputSplit> calculateAutoBalancedSplits(List<InputSplit> splits,
      long maxAverageRegionSize) throws IOException {
    if (splits.size() == 0) {
      return splits;
    }
    List<InputSplit> resultList = new ArrayList<>();
    long totalRegionSize = 0;
    for (InputSplit regionSplit : splits) {
      totalRegionSize += ((TableSplit) regionSplit).getLength();
    }
    long averageRegionSize = totalRegionSize / splits.size();
    // totalRegionSize might be overflow, and the averageRegionSize must be positive.
    if (averageRegionSize <= 0) {
      LOG.warn("The averageRegionSize is not positive: {}, set it to Long.MAX_VALUE / {}",
          averageRegionSize, splits.size());
      averageRegionSize = Long.MAX_VALUE / splits.size();
    }
    // if averageRegionSize is too big, cap it at maxAverageRegionSize
    if (averageRegionSize > maxAverageRegionSize) {
      averageRegionSize = maxAverageRegionSize;
    }
    // if averageRegionSize is too small, we do not need to allocate more mappers for those
    // 'large' regions; the threshold is 16M = (default hdfs block size) / 4
    if (averageRegionSize < MIN_AVERAGE_REGION_SIZE_FOR_BALANCE) {
      return splits;
    }
    for (int i = 0; i < splits.size(); i++) {
      TableSplit ts = (TableSplit) splits.get(i);
      TableName tableName = ts.getTable();
      String regionLocation = ts.getRegionLocation();
      String encodedRegionName = ts.getEncodedRegionName();
      long regionSize = ts.getLength();

      if (regionSize >= averageRegionSize) {
        // make this region as multiple MapReduce input split; the number of splits grows
        // logarithmically with the ratio of the region size to the average size.
        int n = (int) Math.round(
            Math.log(((double) regionSize) / ((double) averageRegionSize)) + 1.0);
        List<InputSplit> temp = createNInputSplitsUniform(ts, n);
        resultList.addAll(temp);
      } else {
        // if the total size of several small continuous regions less than the average region
        // size, combine them into one MapReduce input split.
        long totalSize = regionSize;
        byte[] splitStartKey = ts.getStartRow();
        byte[] splitEndKey = ts.getEndRow();
        int j = i + 1;
        while (j < splits.size()) {
          TableSplit nextRegion = (TableSplit) splits.get(j);
          long nextRegionSize = nextRegion.getLength();
          // Only merge neighbours whose key ranges are adjacent, otherwise the merged split
          // would cover rows that were deliberately left out.
          if (totalSize + nextRegionSize <= averageRegionSize
              && Bytes.equals(splitEndKey, nextRegion.getStartRow())) {
            totalSize = totalSize + nextRegionSize;
            splitEndKey = nextRegion.getEndRow();
            j++;
          } else {
            break;
          }
        }
        // Skip over the regions that were folded into this split.
        i = j - 1;
        TableSplit t = new TableSplit(tableName, scan, splitStartKey, splitEndKey, regionLocation,
            encodedRegionName, totalSize);
        resultList.add(t);
      }
    }
    return resultList;
  }

  /**
   * Resolves the host name for the given address, caching the result per address.
   *
   * @param ipAddress the address to resolve; must not be null
   * @return the host name for the address
   * @throws UnknownHostException if {@code ipAddress} is null or no host name can be found
   */
  String reverseDNS(InetAddress ipAddress) throws UnknownHostException {
    if (ipAddress == null) {
      throw new UnknownHostException("Cannot do a reverse DNS lookup for a null address");
    }
    String hostName = this.reverseDNSCacheMap.get(ipAddress);
    if (hostName == null) {
      String ipAddressString = null;
      try {
        ipAddressString = DNS.reverseDns(ipAddress, null);
      } catch (Exception e) {
        // We can use InetAddress in case the jndi failed to pull up the reverse DNS entry from the
        // name service. Also, in case of ipv6, we need to use the InetAddress since resolving
        // reverse DNS using jndi doesn't work well with ipv6 addresses.
        LOG.debug("Reverse DNS lookup failed for {}, falling back to InetAddress", ipAddress, e);
        ipAddressString = InetAddress.getByName(ipAddress.getHostAddress()).getHostName();
      }
      if (ipAddressString == null) {
        throw new UnknownHostException("No host found for " + ipAddress);
      }
      hostName = Strings.domainNamePointerToHostName(ipAddressString);
      this.reverseDNSCacheMap.put(ipAddress, hostName);
    }
    return hostName;
  }

  /**
   * Test if the given region is to be included in the InputSplit while splitting
   * the regions of a table.
   * <p>
   * This optimization is effective when there is a specific reasoning to exclude an entire region
   * from the M-R job, (and hence, not contributing to the InputSplit), given the start and end
   * keys of the same. <br>
   * Useful when we need to remember the last-processed top record and revisit the
   * [last, current) interval for M-R processing, continuously. In addition to reducing
   * InputSplits, reduces the load on the region server as well, due to the ordering of the keys.
   * <br>
   * <br>
   * Note: It is possible that <code>endKey.length() == 0 </code> , for the last (recent) region.
   * <br>
   * Override this method, if you want to bulk exclude regions altogether from M-R.
   * By default, no region is excluded( i.e. all regions are included).
   *
   *
   * @param startKey Start key of the region
   * @param endKey End key of the region
   * @return true, if this region needs to be included as part of the input (default).
   *
   */
  protected boolean includeRegionInSplit(final byte[] startKey, final byte [] endKey) {
    return true;
  }

  /**
   * Allows subclasses to get the {@link RegionLocator}.
   *
   * @throws IllegalStateException if the table has not been initialized
   */
  protected RegionLocator getRegionLocator() {
    if (regionLocator == null) {
      throw new IllegalStateException(NOT_INITIALIZED);
    }
    return regionLocator;
  }

  /**
   * Allows subclasses to get the {@link Table}.
   *
   * @throws IllegalStateException if the table has not been initialized
   */
  protected Table getTable() {
    if (table == null) {
      throw new IllegalStateException(NOT_INITIALIZED);
    }
    return table;
  }

  /**
   * Allows subclasses to get the {@link Admin}.
   *
   * @throws IllegalStateException if the table has not been initialized
   */
  protected Admin getAdmin() {
    if (admin == null) {
      throw new IllegalStateException(NOT_INITIALIZED);
    }
    return admin;
  }

  /**
   * Allows subclasses to initialize the table information.
   *
   * @param connection  The Connection to the HBase cluster. MUST be unmanaged. We will close.
   * @param tableName  The {@link TableName} of the table to process.
   * @throws IOException if the table, region locator or admin cannot be obtained
   */
  protected void initializeTable(Connection connection, TableName tableName) throws IOException {
    if (this.table != null || this.connection != null) {
      LOG.warn("initializeTable called multiple times. Overwriting connection and table " +
          "reference; TableInputFormatBase will not close these old references when done.");
    }
    this.table = connection.getTable(tableName);
    this.regionLocator = connection.getRegionLocator(tableName);
    this.admin = connection.getAdmin();
    this.connection = connection;
  }

  /**
   * Creates the calculator used to look up region sizes; overridable so tests can supply
   * their own.
   *
   * @param locator the region locator of the table
   * @param admin the admin used to query the cluster
   * @return a new {@link RegionSizeCalculator}
   * @throws IOException if the calculator cannot be created
   */
  @VisibleForTesting
  protected RegionSizeCalculator createRegionSizeCalculator(RegionLocator locator, Admin admin)
      throws IOException {
    return new RegionSizeCalculator(locator, admin);
  }

  /**
   * Gets the scan defining the actual details like columns etc. A default {@link Scan} is
   * created and remembered if none has been set yet.
   *
   * @return The internal scan instance.
   */
  public Scan getScan() {
    if (this.scan == null) {
      this.scan = new Scan();
    }
    return scan;
  }

  /**
   * Sets the scan defining the actual details like columns etc.
   *
   * @param scan  The scan to set.
   */
  public void setScan(Scan scan) {
    this.scan = scan;
  }

  /**
   * Allows subclasses to set the {@link TableRecordReader}.
   *
   * @param tableRecordReader A different {@link TableRecordReader}
   *   implementation.
   */
  protected void setTableRecordReader(TableRecordReader tableRecordReader) {
    this.tableRecordReader = tableRecordReader;
  }

  /**
   * Handle subclass specific set up.
   * Each of the entry points used by the MapReduce framework,
   * {@link #createRecordReader(InputSplit, TaskAttemptContext)} and {@link #getSplits(JobContext)},
   * will call {@link #initialize(JobContext)} as a convenient centralized location to handle
   * retrieving the necessary configuration information and calling
   * {@link #initializeTable(Connection, TableName)}.
   *
   * Subclasses should implement their initialize call such that it is safe to call multiple times.
   * The current TableInputFormatBase implementation relies on a non-null table reference to decide
   * if an initialize call is needed, but this behavior may change in the future. In particular,
   * it is critical that initializeTable not be called multiple times since this will leak
   * Connection instances.
   *
   */
  protected void initialize(JobContext context) throws IOException {
  }

  /**
   * Close the Table and related objects that were initialized via
   * {@link #initializeTable(Connection, TableName)}.
   * <p>
   * Every resource is closed even if closing an earlier one fails, and the references are
   * cleared in all cases.
   *
   * @throws IOException the first failure hit while closing; later failures are attached to it
   *   as suppressed exceptions
   */
  protected void closeTable() throws IOException {
    try {
      close(admin, table, regionLocator, connection);
    } finally {
      admin = null;
      table = null;
      regionLocator = null;
      connection = null;
    }
  }

  /**
   * Closes every non-null resource in order, without letting one failure prevent the remaining
   * resources from being closed.
   *
   * @param closables the resources to close; null entries are skipped
   * @throws IOException the first failure, with any subsequent failures added as suppressed
   */
  private void close(Closeable... closables) throws IOException {
    IOException firstFailure = null;
    for (Closeable c : closables) {
      if (c == null) {
        continue;
      }
      try {
        c.close();
      } catch (IOException e) {
        if (firstFailure == null) {
          firstFailure = e;
        } else {
          firstFailure.addSuppressed(e);
        }
      }
    }
    if (firstFailure != null) {
      throw firstFailure;
    }
  }

}