Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.function.Supplier;

import com.google.common.collect.BoundType;
import com.google.common.collect.Range;
Expand All @@ -44,9 +45,11 @@
import org.apache.calcite.rex.RexUtil;
import org.apache.calcite.rex.RexVisitorImpl;
import org.apache.calcite.sql.SqlKind;
import org.apache.calcite.sql.fun.SqlStdOperatorTable;
import org.apache.calcite.sql.type.SqlTypeName;
import org.apache.calcite.sql.type.SqlTypeUtil;
import org.apache.calcite.util.ImmutableBitSet;
import org.apache.calcite.util.Sarg;
import org.apache.datasketches.kll.KllFloatsSketch;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
Expand All @@ -65,8 +68,10 @@ public class FilterSelectivityEstimator extends RexVisitorImpl<Double> {

protected static final Logger LOG = LoggerFactory.getLogger(FilterSelectivityEstimator.class);

private static final double DEFAULT_COMPARISON_SELECTIVITY = 1.0 / 3.0;

private final RelNode childRel;
private final double childCardinality;
private final double childCardinality;
private final RelMetadataQuery mq;
private final RexBuilder rexBuilder;

Expand Down Expand Up @@ -113,8 +118,10 @@ public Double visitCall(RexCall call) {
selectivity = computeConjunctionSelectivity(call);
break;
}
case SEARCH:
return new SearchTransformer<>(rexBuilder, call, RexUnknownAs.FALSE).transform().accept(this);
case SEARCH: {
selectivity = computeSearchSelectivity(call);
break;
}
case OR: {
selectivity = computeDisjunctionSelectivity(call);
break;
Expand Down Expand Up @@ -159,7 +166,7 @@ public Double visitCall(RexCall call) {
case GREATER_THAN_OR_EQUAL:
case LESS_THAN:
case GREATER_THAN: {
selectivity = computeRangePredicateSelectivity(call, call.getKind());
selectivity = computeComparisonPredicateSelectivity(call, call.getKind());
break;
}

Expand Down Expand Up @@ -405,8 +412,8 @@ private static Range<Float> makeRange(float lower, float upper, BoundType upperT
return lower > upper ? Range.closedOpen(0f, 0f) : Range.range(lower, BoundType.CLOSED, upper, upperType);
}

private double computeRangePredicateSelectivity(RexCall call, SqlKind op) {
double defaultSelectivity = ((double) 1 / (double) 3);
private double computeComparisonPredicateSelectivity(RexCall call, SqlKind op) {
double defaultSelectivity = DEFAULT_COMPARISON_SELECTIVITY;
if (!(childRel instanceof HiveTableScan)) {
return defaultSelectivity;
}
Expand Down Expand Up @@ -440,34 +447,56 @@ private double computeRangePredicateSelectivity(RexCall call, SqlKind op) {
boundaryValues[boundaryIdx] = value;
inclusive[boundaryIdx] = openBound ? BoundType.OPEN : BoundType.CLOSED;
Range<Float> boundaries = Range.range(boundaryValues[0], inclusive[0], boundaryValues[1], inclusive[1]);

// extract the column index from the other operator
final HiveTableScan scan = (HiveTableScan) childRel;
int inputRefOpIndex = 1 - literalOpIdx;
RexNode node = operands.get(inputRefOpIndex);
if (isRemovableCast(node, scan)) {
Range<Float> typeRange = getRangeOfType(node.getType());
boundaries = adjustRangeToType(boundaries, node.getType(), typeRange);
return computeRangePredicateSelectivity(() -> defaultSelectivity, node, boundaries);
}

/**
 * Convenience overload for predicates that are not negated: forwards to the four-argument variant
 * with the inversion flag switched off.
 *
 * @param defaultSelectivity supplier of the fallback selectivity, forwarded unchanged
 * @param operand the expression whose values are checked against the range, forwarded unchanged
 * @param boundaries the range of values accepted by the predicate, forwarded unchanged
 * @return whatever the four-argument variant returns for a non-inverted range
 */
private double computeRangePredicateSelectivity(
    Supplier<Double> defaultSelectivity, RexNode operand, Range<Float> boundaries) {
  final boolean invertRange = false;
  return computeRangePredicateSelectivity(defaultSelectivity, operand, boundaries, invertRange);
}

node = RexUtil.removeCast(node);
/**
* Computes the selectivity of an operand in a certain range trying to leverage the histogram information.
* Returns the default selectivity if the histogram is not available.
*
* <p>The default selectivity is also returned when the child relation is not a {@code HiveTableScan},
* or when the operand (after stripping a removable cast) is not a plain input reference.
*
* @param defaultSelectivity supplier of the fallback selectivity; it is only invoked on the fallback paths.
*          Its result is returned from a method whose return type is primitive {@code double}, so it is unboxed:
*          a supplier yielding {@code null} makes this method throw a {@code NullPointerException}
* @param operand the expression whose values are checked against {@code boundaries}; when it is a removable
*          cast, the cast is stripped and the boundaries are adjusted to the range of the cast type
* @param boundaries the range of values accepted by the predicate
* @param inverseBool when {@code true}, the range selectivity is subtracted from the selectivity of the
*          type range (the whole float line, or the range of the cast type when a cast was stripped)
* @return the histogram-based selectivity, after it has been passed through
*          {@code scaleSelectivityToNullableValues}, or the default selectivity on the fallback paths
*/
private double computeRangePredicateSelectivity(Supplier<Double> defaultSelectivity, RexNode operand,
Range<Float> boundaries, boolean inverseBool /* true only for NOT_BETWEEN */) {
if (!(childRel instanceof HiveTableScan)) {
return defaultSelectivity.get();
}

final HiveTableScan scan = (HiveTableScan) childRel;
// typeRange is only read further down when inverseBool is true, hence null is acceptable otherwise
Range<Float> typeRange = inverseBool ? Range.closed(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY) : null;
if (isRemovableCast(operand, scan)) {
typeRange = getRangeOfType(operand.getType());
boundaries = adjustRangeToType(boundaries, operand.getType(), typeRange);
operand = RexUtil.removeCast(operand);
}

int inputRefIndex = -1;
if (node.getKind().equals(SqlKind.INPUT_REF)) {
inputRefIndex = ((RexInputRef) node).getIndex();
if (operand.getKind().equals(SqlKind.INPUT_REF)) {
inputRefIndex = ((RexInputRef) operand).getIndex();
}

if (inputRefIndex < 0) {
return defaultSelectivity;
return defaultSelectivity.get();
}

final List<ColStatistics> colStats = scan.getColStat(Collections.singletonList(inputRefIndex));
if (colStats.isEmpty() || !isHistogramAvailable(colStats.get(0))) {
return defaultSelectivity;
return defaultSelectivity.get();
}

final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
double rawSelectivity = rangedSelectivity(kll, boundaries);
if (inverseBool) {
// when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted
// if there's a cast, the inversion is with respect to its codomain (range of the values of the cast)
double typeRangeSelectivity = rangedSelectivity(kll, typeRange);
rawSelectivity = typeRangeSelectivity - rawSelectivity;
}
return scaleSelectivityToNullableValues(kll, rawSelectivity, scan);
}

Expand Down Expand Up @@ -511,7 +540,6 @@ private Double computeBetweenPredicateSelectivity(RexCall call) {
Optional<Float> rightLiteral = extractLiteral(operands.get(3));

if (hasLiteralBool && leftLiteral.isPresent() && rightLiteral.isPresent()) {
final HiveTableScan scan = (HiveTableScan) childRel;
float leftValue = leftLiteral.get();
float rightValue = rightLiteral.get();

Expand All @@ -522,36 +550,9 @@ private Double computeBetweenPredicateSelectivity(RexCall call) {
}

Range<Float> rangeBoundaries = makeRange(leftValue, rightValue, BoundType.CLOSED);
Range<Float> typeBoundaries = inverseBool ? Range.closed(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY) : null;

RexNode expr = operands.get(1); // expr to be checked by the BETWEEN
if (isRemovableCast(expr, scan)) {
typeBoundaries = getRangeOfType(expr.getType());
rangeBoundaries = adjustRangeToType(rangeBoundaries, expr.getType(), typeBoundaries);
expr = RexUtil.removeCast(expr);
}

int inputRefIndex = -1;
if (expr.getKind().equals(SqlKind.INPUT_REF)) {
inputRefIndex = ((RexInputRef) expr).getIndex();
}

if (inputRefIndex < 0) {
return computeFunctionSelectivity(call);
}

final List<ColStatistics> colStats = scan.getColStat(Collections.singletonList(inputRefIndex));
if (!colStats.isEmpty() && isHistogramAvailable(colStats.get(0))) {
final KllFloatsSketch kll = KllFloatsSketch.heapify(Memory.wrap(colStats.get(0).getHistogram()));
double rawSelectivity = rangedSelectivity(kll, rangeBoundaries);
if (inverseBool) {
// when inverseBool == true, this is a NOT_BETWEEN and selectivity must be inverted
// if there's a cast, the inversion is with respect to its codomain (range of the values of the cast)
double typeRangeSelectivity = rangedSelectivity(kll, typeBoundaries);
rawSelectivity = typeRangeSelectivity - rawSelectivity;
}
return scaleSelectivityToNullableValues(kll, rawSelectivity, scan);
}
return computeRangePredicateSelectivity(() -> computeFunctionSelectivity(call), expr, rangeBoundaries,
inverseBool);
}
return computeFunctionSelectivity(call);
}
Expand Down Expand Up @@ -603,6 +604,101 @@ private Optional<Float> extractLiteral(SqlTypeName typeName, Object boundValueOb
return Optional.of(value);
}

/**
 * Computes the selectivity of a SEARCH call by delegating to a freshly created
 * {@code SearchSelectivityHelper}.
 *
 * @param search the SEARCH call; the helper's constructor reads its first two operands
 * @return the selectivity computed by the helper
 */
private double computeSearchSelectivity(RexCall search) {
return new SearchSelectivityHelper<>(search).compute();
}

/**
* Auxiliary class to compute the selectivity of a SEARCH expression.
*/
private final class SearchSelectivityHelper<C extends Comparable<C>> {
private final RexNode ref;
private final Sarg<C> sarg;
private final RelDataType operandType;

/**
 * Captures the pieces of a SEARCH call needed for the estimation: the searched expression
 * (operand 0), the Sarg carried by the literal in operand 1, and the type of that literal.
 *
 * @param search the SEARCH call; operand 1 is cast to {@code RexLiteral} and must hold a Sarg
 */
private SearchSelectivityHelper(RexCall search) {
  final List<RexNode> searchOperands = search.getOperands();
  this.ref = searchOperands.get(0);
  final RexLiteral sargLiteral = (RexLiteral) searchOperands.get(1);
  this.sarg = Objects.requireNonNull(sargLiteral.getValueAs(Sarg.class), "Sarg");
  this.operandType = sargLiteral.getType();
}

/**
 * Builds a literal of the Sarg's operand type out of one of the Sarg's endpoint values.
 *
 * @param value an endpoint taken from one of the Sarg's ranges
 * @return the node produced by the RexBuilder for that value
 */
private RexNode makeLiteral(C value) {
  // the two flags are named after the corresponding RexBuilder#makeLiteral parameters
  final boolean allowCast = true;
  final boolean trim = true;
  return rexBuilder.makeLiteral(value, operandType, allowCast, trim);
}

/**
 * Estimates the selectivity of the SEARCH expression wrapped by this helper.
 *
 * <p>The ranges of the Sarg are handled in three groups:
 * <ul>
 *   <li>proper ranges: each one is estimated through {@code computeRangePredicateSelectivity} and the
 *       results are summed up, capping the sum at 1.0;</li>
 *   <li>single-value ranges {@code [v, v]}: the values are collected and estimated together as one
 *       EQUALS (one value) or one IN (several values) predicate;</li>
 *   <li>when the Sarg treats NULL as TRUE, an IS NULL predicate on the searched expression.</li>
 * </ul>
 * If only the first group contributed, its sum is returned as is; otherwise the groups are combined
 * through {@code computeDisjunctionSelectivity}. An unbounded ("all") range short-circuits to 1.0.
 *
 * @return the estimated selectivity of the SEARCH expression
 */
private double compute() {
  final List<Double> selectivityList = new ArrayList<>();
  final List<RexNode> inLiterals = new ArrayList<>();

  double rangesSelectivity = 0d;
  // accumulate the selectivity of all ranges
  for (Range<C> range : sarg.rangeSet.asRanges()) {
    if (!range.hasLowerBound() && !range.hasUpperBound()) {
      return 1.0; // "all" range
    }

    final BoundType lowerBoundType;
    final BoundType upperBoundType;
    final Optional<Float> lowerLiteral;
    final Optional<Float> upperLiteral;
    final Supplier<Double> defaultSelectivity;
    if (range.hasLowerBound() && range.hasUpperBound()) {
      final C lower = range.lowerEndpoint();
      final C upper = range.upperEndpoint();
      lowerBoundType = range.lowerBoundType();
      upperBoundType = range.upperBoundType();
      final RexNode lowerRexLiteral = makeLiteral(lower);
      if (lower.equals(upper) && lowerBoundType == BoundType.CLOSED && upperBoundType == BoundType.CLOSED) {
        // range represents a single value: save it for later
        inLiterals.add(lowerRexLiteral);
        continue;
      }
      final RexNode upperRexLiteral = makeLiteral(upper);
      lowerLiteral = extractLiteral(lowerRexLiteral);
      upperLiteral = extractLiteral(upperRexLiteral);
      defaultSelectivity = () -> {
        // computeFunctionSelectivity returns null when the NDV cannot be computed; the supplier result is
        // unboxed to a primitive double by its consumers, so null must never escape from here
        final Double ndvBased = computeFunctionSelectivity(List.of(ref, lowerRexLiteral, upperRexLiteral));
        if (ndvBased == null) {
          return DEFAULT_COMPARISON_SELECTIVITY;
        }
        return ndvBased;
      };
    } else if (range.hasLowerBound()) {
      lowerLiteral = extractLiteral(makeLiteral(range.lowerEndpoint()));
      lowerBoundType = range.lowerBoundType();
      upperLiteral = Optional.of(Float.POSITIVE_INFINITY);
      upperBoundType = BoundType.CLOSED;
      defaultSelectivity = () -> DEFAULT_COMPARISON_SELECTIVITY;
    } else { // i.e. range.hasUpperBound()
      upperLiteral = extractLiteral(makeLiteral(range.upperEndpoint()));
      upperBoundType = range.upperBoundType();
      lowerLiteral = Optional.of(Float.NEGATIVE_INFINITY);
      lowerBoundType = BoundType.CLOSED;
      defaultSelectivity = () -> DEFAULT_COMPARISON_SELECTIVITY;
    }

    final double currentRangeSelectivity;
    if (lowerLiteral.isPresent() && upperLiteral.isPresent()) {
      final float lowerValue = lowerLiteral.get();
      final float upperValue = upperLiteral.get();
      // Two distinct endpoints may collapse to the same float; Guava rejects the open range (x..x) with an
      // IllegalArgumentException, so such a range is represented by the (valid) empty range [x..x) instead.
      final boolean degenerateOpenRange = lowerValue == upperValue
          && lowerBoundType == BoundType.OPEN && upperBoundType == BoundType.OPEN;
      final Range<Float> floatRange = degenerateOpenRange
          ? Range.closedOpen(lowerValue, lowerValue)
          : Range.range(lowerValue, lowerBoundType, upperValue, upperBoundType);
      currentRangeSelectivity = computeRangePredicateSelectivity(defaultSelectivity, ref, floatRange);
    } else {
      // at least one endpoint could not be turned into a float: no histogram lookup is possible
      currentRangeSelectivity = defaultSelectivity.get();
    }
    rangesSelectivity = Math.min(1.0, rangesSelectivity + currentRangeSelectivity);
  }
  // always the first entry of the list, hence a list of size 1 means "ranges only"
  selectivityList.add(rangesSelectivity);

  if (inLiterals.size() == 1) {
    selectivityList.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, ref, inLiterals.get(0))
        .accept(FilterSelectivityEstimator.this));
  } else if (inLiterals.size() > 1) {
    final List<RexNode> operands = new ArrayList<>(inLiterals.size() + 1);
    operands.add(ref);
    operands.addAll(inLiterals);
    selectivityList.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands).accept(FilterSelectivityEstimator.this));
  }

  if (sarg.nullAs == RexUnknownAs.TRUE) {
    selectivityList.add(
        rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, ref).accept(FilterSelectivityEstimator.this));
  }

  return selectivityList.size() == 1 ? selectivityList.get(0) : computeDisjunctionSelectivity(selectivityList);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I understand computeDisjunctionSelectivity, it combines the selectivities by multiplying their complements. To use the information from the histogram, we would need to add the selectivities (but making sure we stay within the possible values [0,1]).
I wonder how accurate estimating selectivity of individual values are compared to the ranges. This also depends on the type of histogram (I had created a ticket about the shortcoming of KLL) . I think we can just add the selectivities for individual values and ranges together, and revisit this in the future in case of need.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed. I have applied the change to add the ranges selectivities (and it gives better results in certain tests, as expected).
I have kept the "disjunction logic" for the combination with the other expressions (EQ/IN, IS_NULL), since it seems more aligned with how this class works in general when computing OR-combined RexNodes. But I agree this can be revisits in the future.

}
}

/**
* NDV of "f1(x, y, z) != f2(p, q, r)" ->
* "(maxNDV(x,y,z,p,q,r) - 1)/maxNDV(x,y,z,p,q,r)".
Expand Down Expand Up @@ -633,7 +729,11 @@ private Double computeNotEqualitySelectivity(RexCall call) {
* @return
*/
private Double computeFunctionSelectivity(RexCall call) {
  // The max NDV is computed by the operand-list overload; computing it here as well would only
  // produce a value that is thrown away.
  return computeFunctionSelectivity(call.getOperands());
}

private Double computeFunctionSelectivity(List<RexNode> operands) {
Double tmpNDV = getMaxNDV(operands);
if (tmpNDV == null) {
// Could not be computed
return null;
Expand All @@ -653,12 +753,20 @@ private Double computeFunctionSelectivity(RexCall call) {
* @return
*/
private Double computeDisjunctionSelectivity(RexCall call) {
  // estimate every disjunct on its own (in operand order), then let the list-based overload combine them
  final List<RexNode> disjuncts = call.getOperands();
  final List<Double> perDisjunct = new ArrayList<>(disjuncts.size());
  disjuncts.forEach(disjunct -> perDisjunct.add(disjunct.accept(this)));
  return computeDisjunctionSelectivity(perDisjunct);
}

private double computeDisjunctionSelectivity(List<Double> selectivityList) {
Double tmpCardinality;
Double tmpSelectivity;
double selectivity = 1;

for (RexNode dje : call.getOperands()) {
tmpSelectivity = dje.accept(this);
for (Double sel : selectivityList) {
tmpSelectivity = sel;
if (tmpSelectivity == null) {
tmpSelectivity = 0.99;
}
Expand Down Expand Up @@ -729,10 +837,14 @@ private long getMaxNulls(RexCall call, HiveTableScan t) {
}

/**
 * Convenience overload: forwards the operands of the given call to the list-based variant.
 *
 * @param call the call whose operands are inspected
 * @return whatever the list-based variant returns for the operands of {@code call}
 */
private Double getMaxNDV(RexCall call) {
  final List<RexNode> callOperands = call.getOperands();
  return getMaxNDV(callOperands);
}

private Double getMaxNDV(List<RexNode> operands) {
Double tmpNDV;
double maxNDV = 1.0;
InputReferencedVisitor irv;
for (RexNode op : call.getOperands()) {
for (RexNode op : operands) {
if (op instanceof RexInputRef) {
tmpNDV = HiveRelMdDistinctRowCount.getDistinctRowCount(this.childRel, mq,
((RexInputRef) op).getIndex());
Expand Down
Loading
Loading