001/** 002 * 003 * Licensed to the Apache Software Foundation (ASF) under one 004 * or more contributor license agreements. See the NOTICE file 005 * distributed with this work for additional information 006 * regarding copyright ownership. The ASF licenses this file 007 * to you under the Apache License, Version 2.0 (the 008 * "License"); you may not use this file except in compliance 009 * with the License. You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, software 014 * distributed under the License is distributed on an "AS IS" BASIS, 015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 016 * See the License for the specific language governing permissions and 017 * limitations under the License. 018 */ 019package org.apache.hadoop.hbase.mapreduce; 020 021import java.io.IOException; 022 023import org.apache.hadoop.hbase.CompareOperator; 024import org.apache.hadoop.hbase.HConstants; 025import org.apache.yetus.audience.InterfaceAudience; 026import org.slf4j.Logger; 027import org.slf4j.LoggerFactory; 028import org.apache.hadoop.conf.Configuration; 029import org.apache.hadoop.conf.Configured; 030import org.apache.hadoop.fs.Path; 031import org.apache.hadoop.hbase.Cell; 032import org.apache.hadoop.hbase.CellUtil; 033import org.apache.hadoop.hbase.HBaseConfiguration; 034import org.apache.hadoop.hbase.client.Result; 035import org.apache.hadoop.hbase.client.Scan; 036import org.apache.hadoop.hbase.filter.CompareFilter; 037import org.apache.hadoop.hbase.filter.Filter; 038import org.apache.hadoop.hbase.filter.PrefixFilter; 039import org.apache.hadoop.hbase.filter.RegexStringComparator; 040import org.apache.hadoop.hbase.filter.RowFilter; 041import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 042import org.apache.hadoop.hbase.util.Bytes; 043import org.apache.hadoop.io.IntWritable; 044import org.apache.hadoop.io.Text; 045import org.apache.hadoop.mapreduce.Job; 046import org.apache.hadoop.mapreduce.Reducer; 047import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 048import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 049import org.apache.hadoop.util.Tool; 050import org.apache.hadoop.util.ToolRunner; 051 052import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; 053 054 055/** 056 * A job with a a map and reduce phase to count cells in a table. 057 * The counter lists the following stats for a given table: 058 * <pre> 059 * 1. Total number of rows in the table 060 * 2. Total number of CFs across all rows 061 * 3. Total qualifiers across all rows 062 * 4. Total occurrence of each CF 063 * 5. Total occurrence of each qualifier 064 * 6. Total number of versions of each qualifier. 065 * </pre> 066 * 067 * The cellcounter can take optional parameters to use a user 068 * supplied row/family/qualifier string to use in the report and 069 * second a regex based or prefix based row filter to restrict the 070 * count operation to a limited subset of rows from the table or a 071 * start time and/or end time to limit the count to a time range. 072 */ 073@InterfaceAudience.Public 074public class CellCounter extends Configured implements Tool { 075 private static final Logger LOG = 076 LoggerFactory.getLogger(CellCounter.class.getName()); 077 078 079 /** 080 * Name of this 'program'. 081 */ 082 static final String NAME = "CellCounter"; 083 084 private final static String JOB_NAME_CONF_KEY = "mapreduce.job.name"; 085 086 /** 087 * Mapper that runs the count. 088 */ 089 static class CellCounterMapper 090 extends TableMapper<Text, IntWritable> { 091 /** 092 * Counter enumeration to count the actual rows. 093 */ 094 public static enum Counters { 095 ROWS, 096 CELLS 097 } 098 099 private Configuration conf; 100 private String separator; 101 102 // state of current row, family, column needs to persist across map() invocations 103 // in order to properly handle scanner batching, where a single qualifier may have too 104 // many versions for a single map() call 105 private byte[] lastRow; 106 private String currentRowKey; 107 byte[] currentFamily = null; 108 String currentFamilyName = null; 109 byte[] currentQualifier = null; 110 // family + qualifier 111 String currentQualifierName = null; 112 // rowkey + family + qualifier 113 String currentRowQualifierName = null; 114 115 @Override 116 protected void setup(Context context) throws IOException, InterruptedException { 117 conf = context.getConfiguration(); 118 separator = conf.get("ReportSeparator",":"); 119 } 120 121 /** 122 * Maps the data. 123 * 124 * @param row The current table row key. 125 * @param values The columns. 126 * @param context The current context. 127 * @throws IOException When something is broken with the data. 128 */ 129 130 @Override 131 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH", 132 justification="Findbugs is blind to the Precondition null check") 133 public void map(ImmutableBytesWritable row, Result values, 134 Context context) 135 throws IOException { 136 Preconditions.checkState(values != null, 137 "values passed to the map is null"); 138 139 try { 140 byte[] currentRow = values.getRow(); 141 if (lastRow == null || !Bytes.equals(lastRow, currentRow)) { 142 lastRow = currentRow; 143 currentRowKey = Bytes.toStringBinary(currentRow); 144 currentFamily = null; 145 currentQualifier = null; 146 context.getCounter(Counters.ROWS).increment(1); 147 context.write(new Text("Total ROWS"), new IntWritable(1)); 148 } 149 if (!values.isEmpty()) { 150 int cellCount = 0; 151 for (Cell value : values.listCells()) { 152 cellCount++; 153 if (currentFamily == null || !CellUtil.matchingFamily(value, currentFamily)) { 154 currentFamily = CellUtil.cloneFamily(value); 155 currentFamilyName = Bytes.toStringBinary(currentFamily); 156 currentQualifier = null; 157 context.getCounter("CF", currentFamilyName).increment(1); 158 if (1 == context.getCounter("CF", currentFamilyName).getValue()) { 159 context.write(new Text("Total Families Across all Rows"), new IntWritable(1)); 160 context.write(new Text(currentFamily), new IntWritable(1)); 161 } 162 } 163 if (currentQualifier == null || !CellUtil.matchingQualifier(value, currentQualifier)) { 164 currentQualifier = CellUtil.cloneQualifier(value); 165 currentQualifierName = currentFamilyName + separator + 166 Bytes.toStringBinary(currentQualifier); 167 currentRowQualifierName = currentRowKey + separator + currentQualifierName; 168 169 context.write(new Text("Total Qualifiers across all Rows"), 170 new IntWritable(1)); 171 context.write(new Text(currentQualifierName), new IntWritable(1)); 172 } 173 // Increment versions 174 context.write(new Text(currentRowQualifierName + "_Versions"), new IntWritable(1)); 175 } 176 context.getCounter(Counters.CELLS).increment(cellCount); 177 } 178 } catch (InterruptedException e) { 179 e.printStackTrace(); 180 } 181 } 182 } 183 184 static class IntSumReducer<Key> extends Reducer<Key, IntWritable, 185 Key, IntWritable> { 186 187 private IntWritable result = new IntWritable(); 188 public void reduce(Key key, Iterable<IntWritable> values, 189 Context context) 190 throws IOException, InterruptedException { 191 int sum = 0; 192 for (IntWritable val : values) { 193 sum += val.get(); 194 } 195 result.set(sum); 196 context.write(key, result); 197 } 198 } 199 200 /** 201 * Sets up the actual job. 202 * 203 * @param conf The current configuration. 204 * @param args The command line parameters. 205 * @return The newly created job. 206 * @throws IOException When setting up the job fails. 207 */ 208 public static Job createSubmittableJob(Configuration conf, String[] args) 209 throws IOException { 210 String tableName = args[0]; 211 Path outputDir = new Path(args[1]); 212 String reportSeparatorString = (args.length > 2) ? args[2]: ":"; 213 conf.set("ReportSeparator", reportSeparatorString); 214 Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName)); 215 job.setJarByClass(CellCounter.class); 216 Scan scan = getConfiguredScanForJob(conf, args); 217 TableMapReduceUtil.initTableMapperJob(tableName, scan, 218 CellCounterMapper.class, ImmutableBytesWritable.class, Result.class, job); 219 job.setNumReduceTasks(1); 220 job.setMapOutputKeyClass(Text.class); 221 job.setMapOutputValueClass(IntWritable.class); 222 job.setOutputFormatClass(TextOutputFormat.class); 223 job.setOutputKeyClass(Text.class); 224 job.setOutputValueClass(IntWritable.class); 225 FileOutputFormat.setOutputPath(job, outputDir); 226 job.setReducerClass(IntSumReducer.class); 227 return job; 228 } 229 230 private static Scan getConfiguredScanForJob(Configuration conf, String[] args) 231 throws IOException { 232 // create scan with any properties set from TableInputFormat 233 Scan s = TableInputFormat.createScanFromConfiguration(conf); 234 // Set Scan Versions 235 if (conf.get(TableInputFormat.SCAN_MAXVERSIONS) == null) { 236 // default to all versions unless explicitly set 237 s.setMaxVersions(Integer.MAX_VALUE); 238 } 239 s.setCacheBlocks(false); 240 // Set RowFilter or Prefix Filter if applicable. 241 Filter rowFilter = getRowFilter(args); 242 if (rowFilter!= null) { 243 LOG.info("Setting Row Filter for counter."); 244 s.setFilter(rowFilter); 245 } 246 // Set TimeRange if defined 247 long timeRange[] = getTimeRange(args); 248 if (timeRange != null) { 249 LOG.info("Setting TimeRange for counter."); 250 s.setTimeRange(timeRange[0], timeRange[1]); 251 } 252 return s; 253 } 254 255 256 private static Filter getRowFilter(String[] args) { 257 Filter rowFilter = null; 258 String filterCriteria = (args.length > 3) ? args[3]: null; 259 if (filterCriteria == null) return null; 260 if (filterCriteria.startsWith("^")) { 261 String regexPattern = filterCriteria.substring(1, filterCriteria.length()); 262 rowFilter = new RowFilter(CompareOperator.EQUAL, new RegexStringComparator(regexPattern)); 263 } else { 264 rowFilter = new PrefixFilter(Bytes.toBytesBinary(filterCriteria)); 265 } 266 return rowFilter; 267 } 268 269 private static long[] getTimeRange(String[] args) throws IOException { 270 final String startTimeArgKey = "--starttime="; 271 final String endTimeArgKey = "--endtime="; 272 long startTime = 0L; 273 long endTime = 0L; 274 275 for (int i = 1; i < args.length; i++) { 276 System.out.println("i:" + i + "arg[i]" + args[i]); 277 if (args[i].startsWith(startTimeArgKey)) { 278 startTime = Long.parseLong(args[i].substring(startTimeArgKey.length())); 279 } 280 if (args[i].startsWith(endTimeArgKey)) { 281 endTime = Long.parseLong(args[i].substring(endTimeArgKey.length())); 282 } 283 } 284 285 if (startTime == 0 && endTime == 0) 286 return null; 287 288 endTime = endTime == 0 ? HConstants.LATEST_TIMESTAMP : endTime; 289 return new long [] {startTime, endTime}; 290 } 291 292 @Override 293 public int run(String[] args) throws Exception { 294 if (args.length < 2) { 295 printUsage(args.length); 296 return -1; 297 } 298 Job job = createSubmittableJob(getConf(), args); 299 return (job.waitForCompletion(true) ? 0 : 1); 300 } 301 302 private void printUsage(int parameterCount) { 303 System.err.println("ERROR: Wrong number of parameters: " + parameterCount); 304 System.err.println("Usage: hbase cellcounter <tablename> <outputDir> [reportSeparator] " 305 + "[^[regex pattern] or [Prefix]] [--starttime=<starttime> --endtime=<endtime>]"); 306 System.err.println(" Note: -D properties will be applied to the conf used."); 307 System.err.println(" Additionally, all of the SCAN properties from TableInputFormat can be " 308 + "specified to get fine grained control on what is counted."); 309 System.err.println(" -D" + TableInputFormat.SCAN_ROW_START + "=<rowkey>"); 310 System.err.println(" -D" + TableInputFormat.SCAN_ROW_STOP + "=<rowkey>"); 311 System.err.println(" -D" + TableInputFormat.SCAN_COLUMNS + "=\"<col1> <col2>...\""); 312 System.err.println(" -D" + TableInputFormat.SCAN_COLUMN_FAMILY 313 + "=<family1>,<family2>, ..."); 314 System.err.println(" -D" + TableInputFormat.SCAN_TIMESTAMP + "=<timestamp>"); 315 System.err.println(" -D" + TableInputFormat.SCAN_TIMERANGE_START + "=<timestamp>"); 316 System.err.println(" -D" + TableInputFormat.SCAN_TIMERANGE_END + "=<timestamp>"); 317 System.err.println(" -D" + TableInputFormat.SCAN_MAXVERSIONS + "=<count>"); 318 System.err.println(" -D" + TableInputFormat.SCAN_CACHEDROWS + "=<count>"); 319 System.err.println(" -D" + TableInputFormat.SCAN_BATCHSIZE + "=<count>"); 320 System.err.println(" <reportSeparator> parameter can be used to override the default report " 321 + "separator string : used to separate the rowId/column family name and qualifier name."); 322 System.err.println(" [^[regex pattern] or [Prefix] parameter can be used to limit the cell " 323 + "counter count operation to a limited subset of rows from the table based on regex or " 324 + "prefix pattern."); 325 } 326 327 /** 328 * Main entry point. 329 * @param args The command line parameters. 330 * @throws Exception When running the job fails. 331 */ 332 public static void main(String[] args) throws Exception { 333 int errCode = ToolRunner.run(HBaseConfiguration.create(), new CellCounter(), args); 334 System.exit(errCode); 335 } 336 337}