001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * <p> 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * <p> 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.regionserver.querymatcher; 019 020import java.io.IOException; 021import java.util.HashMap; 022import java.util.Map; 023import java.util.NavigableMap; 024import java.util.NavigableSet; 025import java.util.SortedMap; 026import java.util.SortedSet; 027import java.util.TreeMap; 028import java.util.TreeSet; 029 030import org.apache.hadoop.hbase.Cell; 031import org.apache.hadoop.hbase.CellComparator; 032import org.apache.hadoop.hbase.CellUtil; 033import org.apache.hadoop.hbase.PrivateCellUtil; 034import org.apache.hadoop.hbase.KeyValue.Type; 035import org.apache.yetus.audience.InterfaceAudience; 036import org.apache.hadoop.hbase.regionserver.querymatcher.ScanQueryMatcher.MatchCode; 037import org.apache.hadoop.hbase.util.Bytes; 038 039/** 040 * A tracker both implementing ColumnTracker and DeleteTracker, used for mvcc-sensitive scanning. 041 * We should make sure in one QueryMatcher the ColumnTracker and DeleteTracker is the same instance. 042 */ 043@InterfaceAudience.Private 044public class NewVersionBehaviorTracker implements ColumnTracker, DeleteTracker { 045 046 private byte[] lastCqArray; 047 private int lastCqLength; 048 private int lastCqOffset; 049 private long lastCqTs; 050 private long lastCqMvcc; 051 private byte lastCqType; 052 private int columnIndex; 053 private int countCurrentCol; 054 055 protected int maxVersions; 056 private int resultMaxVersions; 057 private byte[][] columns; 058 private int minVersions; 059 private long oldestStamp; 060 private CellComparator comparator; 061 062 // These two maps have same structure. 063 // Each node is a versions deletion (DeleteFamily or DeleteColumn). Key is the mvcc of the marker, 064 // value is a data structure which contains infos we need that happens before this node's mvcc and 065 // after the previous node's mvcc. The last node is a special node whose key is max_long that 066 // saves infos after last deletion. See DeleteVersionsNode's comments for details. 067 // The delColMap is constructed and used for each cq, and thedelFamMap is constructed when cq is 068 // null and saving family-level delete markers. Each time the cq is changed, we should 069 // reconstruct delColMap as a deep copy of delFamMap. 070 protected NavigableMap<Long, DeleteVersionsNode> delColMap = new TreeMap<>(); 071 protected NavigableMap<Long, DeleteVersionsNode> delFamMap = new TreeMap<>(); 072 073 /** 074 * Note maxVersion and minVersion must set according to cf's conf, not user's scan parameter. 075 * 076 * @param columns columns specified user in query 077 * @param comparartor the cell comparator 078 * @param minVersion The minimum number of versions to keep(used when TTL is set). 079 * @param maxVersion The maximum number of versions in CF's conf 080 * @param resultMaxVersions maximum versions to return per column, which may be different from 081 * maxVersion 082 * @param oldestUnexpiredTS the oldest timestamp we are interested in, based on TTL 083 */ 084 public NewVersionBehaviorTracker(NavigableSet<byte[]> columns, CellComparator comparartor, 085 int minVersion, int maxVersion, int resultMaxVersions, long oldestUnexpiredTS) { 086 this.maxVersions = maxVersion; 087 this.minVersions = minVersion; 088 this.resultMaxVersions = resultMaxVersions; 089 this.oldestStamp = oldestUnexpiredTS; 090 if (columns != null && columns.size() > 0) { 091 this.columns = new byte[columns.size()][]; 092 int i = 0; 093 for (byte[] column : columns) { 094 this.columns[i++] = column; 095 } 096 } 097 this.comparator = comparartor; 098 reset(); 099 } 100 101 @Override 102 public void beforeShipped() throws IOException { 103 // Do nothing 104 } 105 106 /** 107 * A data structure which contains infos we need that happens before this node's mvcc and 108 * after the previous node's mvcc. A node means there is a version deletion at the mvcc and ts. 109 */ 110 protected class DeleteVersionsNode { 111 public long ts; 112 public long mvcc; 113 114 // <timestamp, set<mvcc>> 115 // Key is ts of version deletes, value is its mvccs. 116 // We may delete more than one time for a version. 117 private Map<Long, SortedSet<Long>> deletesMap = new HashMap<>(); 118 119 // <mvcc, set<mvcc>> 120 // Key is mvcc of version deletes, value is mvcc of visible puts before the delete effect. 121 private NavigableMap<Long, SortedSet<Long>> mvccCountingMap = new TreeMap<>(); 122 123 protected DeleteVersionsNode(long ts, long mvcc) { 124 this.ts = ts; 125 this.mvcc = mvcc; 126 mvccCountingMap.put(Long.MAX_VALUE, new TreeSet<Long>()); 127 } 128 129 protected DeleteVersionsNode() { 130 this(Long.MIN_VALUE, Long.MAX_VALUE); 131 } 132 133 public void addVersionDelete(Cell cell) { 134 SortedSet<Long> set = deletesMap.get(cell.getTimestamp()); 135 if (set == null) { 136 set = new TreeSet<>(); 137 deletesMap.put(cell.getTimestamp(), set); 138 } 139 set.add(cell.getSequenceId()); 140 // The init set should be the puts whose mvcc is smaller than this Delete. Because 141 // there may be some Puts masked by them. The Puts whose mvcc is larger than this Delete can 142 // not be copied to this node because we may delete one version and the oldest put may not be 143 // masked. 144 SortedSet<Long> nextValue = mvccCountingMap.ceilingEntry(cell.getSequenceId()).getValue(); 145 SortedSet<Long> thisValue = new TreeSet<>(nextValue.headSet(cell.getSequenceId())); 146 mvccCountingMap.put(cell.getSequenceId(), thisValue); 147 } 148 149 protected DeleteVersionsNode getDeepCopy() { 150 DeleteVersionsNode node = new DeleteVersionsNode(ts, mvcc); 151 for (Map.Entry<Long, SortedSet<Long>> e : deletesMap.entrySet()) { 152 node.deletesMap.put(e.getKey(), new TreeSet<>(e.getValue())); 153 } 154 for (Map.Entry<Long, SortedSet<Long>> e : mvccCountingMap.entrySet()) { 155 node.mvccCountingMap.put(e.getKey(), new TreeSet<>(e.getValue())); 156 } 157 return node; 158 } 159 } 160 161 /** 162 * Reset the map if it is different with the last Cell. 163 * Save the cq array/offset/length for next Cell. 164 * 165 * @return If this put has duplicate ts with last cell, return the mvcc of last cell. 166 * Else return MAX_VALUE. 167 */ 168 protected long prepare(Cell cell) { 169 boolean matchCq = 170 PrivateCellUtil.matchingQualifier(cell, lastCqArray, lastCqOffset, lastCqLength); 171 if (!matchCq) { 172 // The last cell is family-level delete and this is not, or the cq is changed, 173 // we should construct delColMap as a deep copy of delFamMap. 174 delColMap.clear(); 175 for (Map.Entry<Long, DeleteVersionsNode> e : delFamMap.entrySet()) { 176 delColMap.put(e.getKey(), e.getValue().getDeepCopy()); 177 } 178 countCurrentCol = 0; 179 } 180 if (matchCq && !PrivateCellUtil.isDelete(lastCqType) && lastCqType == cell.getTypeByte() 181 && lastCqTs == cell.getTimestamp()) { 182 // Put with duplicate timestamp, ignore. 183 return lastCqMvcc; 184 } 185 lastCqArray = cell.getQualifierArray(); 186 lastCqOffset = cell.getQualifierOffset(); 187 lastCqLength = cell.getQualifierLength(); 188 lastCqTs = cell.getTimestamp(); 189 lastCqMvcc = cell.getSequenceId(); 190 lastCqType = cell.getTypeByte(); 191 return Long.MAX_VALUE; 192 } 193 194 // DeleteTracker 195 @Override 196 public void add(Cell cell) { 197 prepare(cell); 198 byte type = cell.getTypeByte(); 199 switch (Type.codeToType(type)) { 200 // By the order of seen. We put null cq at first. 201 case DeleteFamily: // Delete all versions of all columns of the specified family 202 delFamMap.put(cell.getSequenceId(), 203 new DeleteVersionsNode(cell.getTimestamp(), cell.getSequenceId())); 204 break; 205 case DeleteFamilyVersion: // Delete all columns of the specified family and specified version 206 delFamMap.ceilingEntry(cell.getSequenceId()).getValue().addVersionDelete(cell); 207 break; 208 209 // These two kinds of markers are mix with Puts. 210 case DeleteColumn: // Delete all versions of the specified column 211 delColMap.put(cell.getSequenceId(), 212 new DeleteVersionsNode(cell.getTimestamp(), cell.getSequenceId())); 213 break; 214 case Delete: // Delete the specified version of the specified column. 215 delColMap.ceilingEntry(cell.getSequenceId()).getValue().addVersionDelete(cell); 216 break; 217 default: 218 throw new AssertionError("Unknown delete marker type for " + cell); 219 } 220 } 221 222 /** 223 * This method is not idempotent, we will save some info to judge VERSION_MASKED. 224 * @param cell - current cell to check if deleted by a previously seen delete 225 * @return We don't distinguish DeleteColumn and DeleteFamily. We only return code for column. 226 */ 227 @Override 228 public DeleteResult isDeleted(Cell cell) { 229 long duplicateMvcc = prepare(cell); 230 231 for (Map.Entry<Long, DeleteVersionsNode> e : delColMap.tailMap(cell.getSequenceId()) 232 .entrySet()) { 233 DeleteVersionsNode node = e.getValue(); 234 long deleteMvcc = Long.MAX_VALUE; 235 SortedSet<Long> deleteVersionMvccs = node.deletesMap.get(cell.getTimestamp()); 236 if (deleteVersionMvccs != null) { 237 SortedSet<Long> tail = deleteVersionMvccs.tailSet(cell.getSequenceId()); 238 if (!tail.isEmpty()) { 239 deleteMvcc = tail.first(); 240 } 241 } 242 SortedMap<Long, SortedSet<Long>> subMap = 243 node.mvccCountingMap 244 .subMap(cell.getSequenceId(), true, Math.min(duplicateMvcc, deleteMvcc), true); 245 for (Map.Entry<Long, SortedSet<Long>> seg : subMap.entrySet()) { 246 if (seg.getValue().size() >= maxVersions) { 247 return DeleteResult.VERSION_MASKED; 248 } 249 seg.getValue().add(cell.getSequenceId()); 250 } 251 if (deleteMvcc < Long.MAX_VALUE) { 252 return DeleteResult.VERSION_DELETED; 253 } 254 255 if (cell.getTimestamp() <= node.ts) { 256 return DeleteResult.COLUMN_DELETED; 257 } 258 } 259 if (duplicateMvcc < Long.MAX_VALUE) { 260 return DeleteResult.VERSION_MASKED; 261 } 262 return DeleteResult.NOT_DELETED; 263 } 264 265 @Override 266 public boolean isEmpty() { 267 return delColMap.size() == 1 && delColMap.get(Long.MAX_VALUE).mvccCountingMap.size() == 1 268 && delFamMap.size() == 1 && delFamMap.get(Long.MAX_VALUE).mvccCountingMap.size() == 1; 269 } 270 271 @Override 272 public void update() { 273 // ignore 274 } 275 276 //ColumnTracker 277 278 @Override 279 public MatchCode checkColumn(Cell cell, byte type) throws IOException { 280 if (columns == null) { 281 return MatchCode.INCLUDE; 282 } 283 284 while (!done()) { 285 int c = CellUtil.compareQualifiers(cell, 286 columns[columnIndex], 0, columns[columnIndex].length); 287 if (c < 0) { 288 return MatchCode.SEEK_NEXT_COL; 289 } 290 291 if (c == 0) { 292 // We drop old version in #isDeleted, so here we must return INCLUDE. 293 return MatchCode.INCLUDE; 294 } 295 296 columnIndex++; 297 } 298 // No more columns left, we are done with this query 299 return MatchCode.SEEK_NEXT_ROW; 300 } 301 302 @Override 303 public MatchCode checkVersions(Cell cell, long timestamp, byte type, 304 boolean ignoreCount) throws IOException { 305 assert !PrivateCellUtil.isDelete(type); 306 // We drop old version in #isDeleted, so here we won't SKIP because of versioning. But we should 307 // consider TTL. 308 if (ignoreCount) { 309 return MatchCode.INCLUDE; 310 } 311 countCurrentCol++; 312 if (timestamp < this.oldestStamp) { 313 if (countCurrentCol == minVersions) { 314 return MatchCode.INCLUDE_AND_SEEK_NEXT_COL; 315 } 316 if (countCurrentCol > minVersions) { 317 // This may not be reached, only for safety. 318 return MatchCode.SEEK_NEXT_COL; 319 } 320 } 321 322 if (countCurrentCol == resultMaxVersions) { 323 // We have enough number of versions for user's requirement. 324 return MatchCode.INCLUDE_AND_SEEK_NEXT_COL; 325 } 326 if (countCurrentCol > resultMaxVersions) { 327 // This may not be reached, only for safety 328 return MatchCode.SEEK_NEXT_COL; 329 } 330 return MatchCode.INCLUDE; 331 } 332 333 @Override 334 public void reset() { 335 delColMap.clear(); 336 delFamMap.clear(); 337 lastCqArray = null; 338 lastCqLength = 0; 339 lastCqOffset = 0; 340 lastCqTs = Long.MIN_VALUE; 341 lastCqMvcc = 0; 342 lastCqType = 0; 343 columnIndex = 0; 344 countCurrentCol = 0; 345 resetInternal(); 346 } 347 348 protected void resetInternal(){ 349 delFamMap.put(Long.MAX_VALUE, new DeleteVersionsNode()); 350 } 351 352 @Override 353 public boolean done() { 354 return columns != null && columnIndex >= columns.length; 355 } 356 357 @Override 358 public ColumnCount getColumnHint() { 359 if (columns != null) { 360 if (columnIndex < columns.length) { 361 return new ColumnCount(columns[columnIndex]); 362 } 363 } 364 return null; 365 } 366 367 @Override 368 public MatchCode getNextRowOrNextColumn(Cell cell) { 369 // TODO maybe we can optimize. 370 return MatchCode.SEEK_NEXT_COL; 371 } 372 373 @Override 374 public boolean isDone(long timestamp) { 375 // We can not skip Cells with small ts. 376 return false; 377 } 378 379 @Override 380 public CellComparator getCellComparator() { 381 return this.comparator; 382 } 383 384}