diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index abfe6170217e..1de391c06079 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -2013,6 +2013,11 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, } } + // If PK-FK inference failed, check for NDV=0 on join keys and use joinFactor heuristic + if (inferredRowCount == -1 && hasZeroNdvJoinKey(joinKeys, joinStats)) { + inferredRowCount = computeJoinFactorEstimate(conf, Collections.max(rowCounts), rowCounts.size()); + } + List distinctVals = Lists.newArrayList(); // these ndvs are later used to compute unmatched rows and num of nulls for outer joins @@ -2136,7 +2141,6 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, } else { // worst case when there are no column statistics - float joinFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_JOIN_FACTOR); int numParents = parents.size(); long crossRowCount = 1; long crossDataSize = 1; @@ -2182,14 +2186,8 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, newNumRows = crossRowCount; newDataSize = crossDataSize; } else { - if (numParents > 1) { - newNumRows = StatsUtils.safeMult(StatsUtils.safeMult(maxRowCount, (numParents - 1)), joinFactor); - newDataSize = StatsUtils.safeMult(StatsUtils.safeMult(maxDataSize, (numParents - 1)), joinFactor); - } else { - // MUX operator with 1 parent - newNumRows = StatsUtils.safeMult(maxRowCount, joinFactor); - newDataSize = StatsUtils.safeMult(maxDataSize, joinFactor); - } + newNumRows = computeJoinFactorEstimate(conf, maxRowCount, numParents); + newDataSize = computeJoinFactorEstimate(conf, maxDataSize, numParents); } Statistics wcStats = new Statistics(newNumRows, newDataSize, 0, 0); @@ -2748,6 +2746,29 @@ private long computeRowCountAssumingInnerJoin(List rowCountParents, long d return result; } + @VisibleForTesting + static long computeJoinFactorEstimate(HiveConf conf, long maxValue, int numParents) { + float joinFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_JOIN_FACTOR); + if (numParents > 1) { + return StatsUtils.safeMult(StatsUtils.safeMult(maxValue, (numParents - 1)), joinFactor); + } + return StatsUtils.safeMult(maxValue, joinFactor); + } + + @VisibleForTesting + static boolean hasZeroNdvJoinKey(Map> joinKeys, + Map joinStats) { + return joinKeys.entrySet().stream().anyMatch(entry -> { + Statistics posStats = joinStats.get(entry.getKey()); + if (posStats.getNumRows() <= 1) { + return false; + } + return entry.getValue().stream() + .map(posStats::getColumnStatisticsFromColName) + .anyMatch(cs -> cs != null && cs.getCountDistint() == 0L); + }); + } + private void updateJoinColumnsNDV(Map> joinKeys, Map joinStats, int numAttr) { int joinColIdx = 0; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/stats/annotation/TestStatsRulesProcFactory.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/stats/annotation/TestStatsRulesProcFactory.java index 4d9d351af8f1..11b6e78b8e58 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/stats/annotation/TestStatsRulesProcFactory.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/stats/annotation/TestStatsRulesProcFactory.java @@ -37,24 +37,47 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.CommonJoinOperator; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.plan.AggregationDesc; +import org.apache.hadoop.hive.ql.plan.JoinCondDesc; +import org.apache.hadoop.hive.ql.plan.JoinDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.mapper.PlanMapper; +import org.apache.hadoop.hive.ql.plan.mapper.StatsSource; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; -import org.junit.Test; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Stack; +import java.util.stream.Stream; -import org.apache.hadoop.hive.ql.exec.CommonJoinOperator; -import org.apache.hadoop.hive.ql.plan.JoinCondDesc; -import org.apache.hadoop.hive.ql.plan.JoinDesc; +import static org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory.FilterStatsRule.extractFloatFromLiteralValue; + +import org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory.JoinStatsRule; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -import static org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory.FilterStatsRule.extractFloatFromLiteralValue; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertThrows; - public class TestStatsRulesProcFactory { private final static String COL_NAME = "col1"; @@ -66,7 +89,7 @@ public class TestStatsRulesProcFactory { private final static long[] VALUES = { 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 4L, 5L, 6L, 7L }; @Test - public void testComparisonRowCountZeroNonNullValues() throws SemanticException { + void testComparisonRowCountZeroNonNullValues() throws SemanticException { long numNulls = 2; long[] values = {}; Statistics stats = createStatistics(values, numNulls); @@ -80,7 +103,7 @@ public void testComparisonRowCountZeroNonNullValues() throws SemanticException { } @Test - public void testComparisonRowCountInvalidKll() throws SemanticException { + void testComparisonRowCountInvalidKll() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); stats.getColumnStats().get(0).setHistogram(null); @@ -103,7 +126,7 @@ public void testComparisonRowCountInvalidKll() throws SemanticException { } @Test - public void testComparisonRowCountLessThan() throws SemanticException { + void testComparisonRowCountLessThan() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -116,7 +139,7 @@ public void testComparisonRowCountLessThan() throws SemanticException { } @Test - public void testComparisonRowCountLessThanMin() throws SemanticException { + void testComparisonRowCountLessThanMin() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -129,7 +152,7 @@ public void testComparisonRowCountLessThanMin() throws SemanticException { } @Test - public void testComparisonRowCountLessThanBelowMin() throws SemanticException { + void testComparisonRowCountLessThanBelowMin() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -142,7 +165,7 @@ public void testComparisonRowCountLessThanBelowMin() throws SemanticException { } @Test - public void testComparisonRowCountLessThanMax() throws SemanticException { + void testComparisonRowCountLessThanMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -155,7 +178,7 @@ public void testComparisonRowCountLessThanMax() throws SemanticException { } @Test - public void testComparisonRowCountLessThanAboveMax() throws SemanticException { + void testComparisonRowCountLessThanAboveMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -168,7 +191,7 @@ public void testComparisonRowCountLessThanAboveMax() throws SemanticException { } @Test - public void testComparisonRowCountEqualOrLessThan() throws SemanticException { + void testComparisonRowCountEqualOrLessThan() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -180,7 +203,7 @@ public void testComparisonRowCountEqualOrLessThan() throws SemanticException { } @Test - public void testComparisonRowCountEqualOrLessThanMin() throws SemanticException { + void testComparisonRowCountEqualOrLessThanMin() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -192,7 +215,7 @@ public void testComparisonRowCountEqualOrLessThanMin() throws SemanticException } @Test - public void testComparisonRowCountEqualOrLessThanBelowMin() throws SemanticException { + void testComparisonRowCountEqualOrLessThanBelowMin() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -204,7 +227,7 @@ public void testComparisonRowCountEqualOrLessThanBelowMin() throws SemanticExcep } @Test - public void testComparisonRowCountEqualOrLessThanMax() throws SemanticException { + void testComparisonRowCountEqualOrLessThanMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -216,7 +239,7 @@ public void testComparisonRowCountEqualOrLessThanMax() throws SemanticException } @Test - public void testComparisonRowCountEqualOrLessThanAboveMax() throws SemanticException { + void testComparisonRowCountEqualOrLessThanAboveMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -228,7 +251,7 @@ public void testComparisonRowCountEqualOrLessThanAboveMax() throws SemanticExcep } @Test - public void testComparisonRowCountGreaterThan() throws SemanticException { + void testComparisonRowCountGreaterThan() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -240,7 +263,7 @@ public void testComparisonRowCountGreaterThan() throws SemanticException { } @Test - public void testComparisonRowCountGreaterThanMin() throws SemanticException { + void testComparisonRowCountGreaterThanMin() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -252,7 +275,7 @@ public void testComparisonRowCountGreaterThanMin() throws SemanticException { } @Test - public void testComparisonRowCountGreaterThanBelowMin() throws SemanticException { + void testComparisonRowCountGreaterThanBelowMin() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -264,7 +287,7 @@ public void testComparisonRowCountGreaterThanBelowMin() throws SemanticException } @Test - public void testComparisonRowCountGreaterThanMax() throws SemanticException { + void testComparisonRowCountGreaterThanMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -276,7 +299,7 @@ public void testComparisonRowCountGreaterThanMax() throws SemanticException { } @Test - public void testComparisonRowCountGreaterThanAboveMax() throws SemanticException { + void testComparisonRowCountGreaterThanAboveMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -288,7 +311,7 @@ public void testComparisonRowCountGreaterThanAboveMax() throws SemanticException } @Test - public void testComparisonRowCountEqualOrGreaterThan() throws SemanticException { + void testComparisonRowCountEqualOrGreaterThan() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -300,7 +323,7 @@ public void testComparisonRowCountEqualOrGreaterThan() throws SemanticException } @Test - public void testComparisonRowCountEqualOrGreaterThanMin() throws SemanticException { + void testComparisonRowCountEqualOrGreaterThanMin() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -312,7 +335,7 @@ public void testComparisonRowCountEqualOrGreaterThanMin() throws SemanticExcepti } @Test - public void testComparisonRowCountEqualOrGreaterThanBelowMin() throws SemanticException { + void testComparisonRowCountEqualOrGreaterThanBelowMin() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -324,7 +347,7 @@ public void testComparisonRowCountEqualOrGreaterThanBelowMin() throws SemanticEx } @Test - public void testComparisonRowCountEqualOrGreaterThanMax() throws SemanticException { + void testComparisonRowCountEqualOrGreaterThanMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -336,7 +359,7 @@ public void testComparisonRowCountEqualOrGreaterThanMax() throws SemanticExcepti } @Test - public void testComparisonRowCountEqualOrGreaterThanBeyondMax() throws SemanticException { + void testComparisonRowCountEqualOrGreaterThanBeyondMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); ExprNodeDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, @@ -348,7 +371,7 @@ public void testComparisonRowCountEqualOrGreaterThanBeyondMax() throws SemanticE } @Test - public void testComparisonRowCountEqualOrLessThanWhenMinEqualMax() throws SemanticException { + void testComparisonRowCountEqualOrLessThanWhenMinEqualMax() throws SemanticException { long[] values = { 1L, 1L }; long numNulls = 2; Statistics stats = createStatistics(values, numNulls); @@ -362,7 +385,7 @@ public void testComparisonRowCountEqualOrLessThanWhenMinEqualMax() throws Semant } @Test - public void testComparisonRowCountEqualOrGreaterThanWhenMinEqualMax() throws SemanticException { + void testComparisonRowCountEqualOrGreaterThanWhenMinEqualMax() throws SemanticException { long[] values = { 1L, 1L }; long numNulls = 2; Statistics stats = createStatistics(values, numNulls); @@ -376,7 +399,7 @@ public void testComparisonRowCountEqualOrGreaterThanWhenMinEqualMax() throws Sem } @Test - public void testBetween() throws SemanticException { + void testBetween() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -390,7 +413,7 @@ public void testBetween() throws SemanticException { } @Test - public void testLiteralExtraction() { + void testLiteralExtraction() { final double DELTA = 1e-5; assertEquals((float) 100, @@ -420,7 +443,7 @@ public void testLiteralExtraction() { } @Test - public void testLiteralExtractionFailures() { + void testLiteralExtractionFailures() { // make sure the correct exceptions are raised so that we can default to standard computation String[] types = {"int", "tinyint", "smallint", "bigint", "date", "timestamp", "float", "double"}; for (String type : types) { @@ -437,7 +460,7 @@ public void testLiteralExtractionFailures() { } @Test - public void testBetweenLeftLowerThanMin() throws SemanticException { + void testBetweenLeftLowerThanMin() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -457,7 +480,7 @@ public void testBetweenLeftLowerThanMin() throws SemanticException { } @Test - public void testBetweenLeftLowerThanMinRightHigherThanMax() throws SemanticException { + void testBetweenLeftLowerThanMinRightHigherThanMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -471,7 +494,7 @@ public void testBetweenLeftLowerThanMinRightHigherThanMax() throws SemanticExcep } @Test - public void testBetweenRightHigherThanMax() throws SemanticException { + void testBetweenRightHigherThanMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -491,7 +514,7 @@ public void testBetweenRightHigherThanMax() throws SemanticException { } @Test - public void testBetweenRightLowerThanMin() throws SemanticException { + void testBetweenRightLowerThanMin() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -505,7 +528,7 @@ public void testBetweenRightLowerThanMin() throws SemanticException { } @Test - public void testBetweenLeftHigherThanMax() throws SemanticException { + void testBetweenLeftHigherThanMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -519,7 +542,7 @@ public void testBetweenLeftHigherThanMax() throws SemanticException { } @Test - public void testBetweenLeftEqualMax() throws SemanticException { + void testBetweenLeftEqualMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -533,7 +556,7 @@ public void testBetweenLeftEqualMax() throws SemanticException { } @Test - public void testNotBetween() throws SemanticException { + void testNotBetween() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -558,7 +581,7 @@ public void testNotBetween() throws SemanticException { } @Test - public void testNotBetweenLowerThanMinHigherThanMax() throws SemanticException { + void testNotBetweenLowerThanMinHigherThanMax() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -572,7 +595,7 @@ public void testNotBetweenLowerThanMinHigherThanMax() throws SemanticException { } @Test - public void testNotBetweenLeftEqualsRight() throws SemanticException { + void testNotBetweenLeftEqualsRight() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -586,7 +609,7 @@ public void testNotBetweenLeftEqualsRight() throws SemanticException { } @Test - public void testNotBetweenRightLowerThanLeft() throws SemanticException { + void testNotBetweenRightLowerThanLeft() throws SemanticException { long numNulls = 2; Statistics stats = createStatistics(VALUES, numNulls); @@ -599,56 +622,13 @@ public void testNotBetweenRightLowerThanLeft() throws SemanticException { assertEquals(VALUES.length, numRows); } - private ExprNodeDesc createExprNodeConstantDesc(int value) { - return new ExprNodeConstantDesc(TypeInfoFactory.intTypeInfo, value); - } - - private Statistics createStatistics(long[] values, long numNulls) { - long numDVs = Arrays.stream(values).distinct().count(); - Statistics stats = new Statistics(values.length + numNulls, 100, 100, 100); - - HyperLogLog hll = StatisticsTestUtils.createHll(values); - float[] val = new float[values.length]; - for (int i = 0; i < values.length; i++) { - val[i] = values[i]; - } - KllFloatsSketch kll = StatisticsTestUtils.createKll(val); - ColStatistics colStatistics = createColStatistics(COL_NAME, "int", numNulls, numDVs, hll, kll); - - stats.addToColumnStats(Collections.singletonList(colStatistics)); - - return stats; - } - - private static ColStatistics createColStatistics( - String colName, String colType, long numNulls, long numDVs, Object hll, KllFloatsSketch kll) { - ColStatistics colStatistics = new ColStatistics(colName, colType); - - colStatistics.setNumNulls(numNulls); - colStatistics.setCountDistint(numDVs); - if (hll != null) { - if (hll instanceof HyperLogLog) { - colStatistics.setBitVectors(((HyperLogLog) hll).serialize()); - } else if (hll instanceof FMSketch) { - colStatistics.setBitVectors(((FMSketch) hll).serialize()); - } else { - throw new IllegalArgumentException("Unsupported HLL class: " + hll.getClass().getName()); - } - } - if (kll != null) { - colStatistics.setHistogram(kll.toByteArray()); - } - - return colStatistics; - } - /** * Test that computeAggregateColumnMinMax properly handles numNulls=-1 (unknown). * With the fix, numNulls=-1 should be treated as 0, giving valuesCount = numRows. * Without the fix, valuesCount = numRows - (-1) = numRows + 1 (wrong). */ @Test - public void testComputeAggregateColumnMinMaxWithUnknownNumNulls() throws SemanticException { + void testComputeAggregateColumnMinMaxWithUnknownNumNulls() throws SemanticException { ColStatistics cs = new ColStatistics("_col0", "bigint"); HiveConf conf = new HiveConf(); @@ -676,14 +656,14 @@ public void testComputeAggregateColumnMinMaxWithUnknownNumNulls() throws Semanti // Verify: With the fix, COUNT Range should be (0, 100) // numNulls=-1 is treated as 0, so valuesCount = 100 - 0 = 100 // Without the fix, valuesCount = 100 - (-1) = 101 (WRONG) - assertNotNull("Range should be set on COUNT column", cs.getRange()); - assertEquals("COUNT min should be 0", 0L, ((Number) cs.getRange().minValue).longValue()); - assertEquals("COUNT max should be 100 (numRows), not 101", - 100L, ((Number) cs.getRange().maxValue).longValue()); + assertNotNull(cs.getRange(), "Range should be set on COUNT column"); + assertEquals(0L, ((Number) cs.getRange().minValue).longValue(), "COUNT min should be 0"); + assertEquals(100L, ((Number) cs.getRange().maxValue).longValue(), + "COUNT max should be 100 (numRows), not 101"); } @Test - public void testComputeAggregateColumnMinMaxWithKnownNumNulls() throws SemanticException { + void testComputeAggregateColumnMinMaxWithKnownNumNulls() throws SemanticException { ColStatistics cs = new ColStatistics("_col0", "bigint"); HiveConf conf = new HiveConf(); @@ -708,10 +688,10 @@ public void testComputeAggregateColumnMinMaxWithKnownNumNulls() throws SemanticE cs, conf, agg, "bigint", parentStats); // With known numNulls=20, valuesCount = 100 - 20 = 80 - assertNotNull("Range should be set", cs.getRange()); + assertNotNull(cs.getRange(), "Range should be set"); assertEquals(0L, ((Number) cs.getRange().minValue).longValue()); - assertEquals("COUNT max should be 80 (numRows - numNulls)", - 80L, ((Number) cs.getRange().maxValue).longValue()); + assertEquals(80L, ((Number) cs.getRange().maxValue).longValue(), + "COUNT max should be 80 (numRows - numNulls)"); } /** @@ -720,7 +700,7 @@ public void testComputeAggregateColumnMinMaxWithKnownNumNulls() throws SemanticE * Without the fix, LEFT_OUTER_JOIN would calculate: newNumNulls = oldNumNulls + leftUnmatchedRows = -1 + 100 = 99 */ @Test - public void testUpdateNumNullsPreservesUnknownNumNulls() { + void testUpdateNumNullsPreservesUnknownNumNulls() { StatsRulesProcFactory.JoinStatsRule joinStatsRule = new StatsRulesProcFactory.JoinStatsRule(); // Create ColStatistics with numNulls = -1 (unknown) @@ -749,7 +729,262 @@ public void testUpdateNumNullsPreservesUnknownNumNulls() { joinStatsRule.updateNumNulls(colStats, 100L, 100L, 1000L, 0L, mockJop); // Assert that numNulls is still -1 (unchanged) - assertEquals("Unknown numNulls (-1) should be preserved after updateNumNulls", - -1L, colStats.getNumNulls()); + assertEquals(-1L, colStats.getNumNulls(), + "Unknown numNulls (-1) should be preserved after updateNumNulls"); + } + + @ParameterizedTest(name = "{0}") + @MethodSource("joinFactorEstimateTestData") + void testComputeJoinFactorEstimate(String scenario, long maxValue, int numParents, long expected) { + HiveConf conf = new HiveConf(); + assertEquals(expected, JoinStatsRule.computeJoinFactorEstimate(conf, maxValue, numParents)); + } + + @ParameterizedTest(name = "{0}") + @MethodSource("hasZeroNdvJoinKeyTestData") + void testHasZeroNdvJoinKey(String scenario, Map> joinKeys, + Map joinStats, boolean expected) { + assertEquals(expected, JoinStatsRule.hasZeroNdvJoinKey(joinKeys, joinStats)); + } + + @Test + @SuppressWarnings("unchecked") + void testJoinStatsRuleWithZeroNdv() throws SemanticException { + HiveConf conf = new HiveConf(); + PlanMapper planMapper = mock(PlanMapper.class); + StatsSource statsSource = mock(StatsSource.class); + Context context = mock(Context.class); + when(context.getConf()).thenReturn(conf); + when(context.getPlanMapper()).thenReturn(planMapper); + when(context.getStatsSource()).thenReturn(statsSource); + ParseContext pctx = mock(ParseContext.class); + when(pctx.getConf()).thenReturn(conf); + when(pctx.getContext()).thenReturn(context); + AnnotateStatsProcCtx ctx = new AnnotateStatsProcCtx(pctx); + + Statistics leftStats = new Statistics(1000L, 10000L, 0L, 0L); + leftStats.setBasicStatsState(Statistics.State.COMPLETE); + leftStats.setColumnStatsState(Statistics.State.COMPLETE); + ColStatistics leftColStats = new ColStatistics("KEY.key", "int"); + leftColStats.setCountDistint(0L); + leftColStats.setNumNulls(0L); + leftStats.addToColumnStats(Collections.singletonList(leftColStats)); + + Statistics rightStats = new Statistics(500L, 5000L, 0L, 0L); + rightStats.setBasicStatsState(Statistics.State.COMPLETE); + rightStats.setColumnStatsState(Statistics.State.COMPLETE); + ColStatistics rightColStats = new ColStatistics("KEY.key", "int"); + rightColStats.setCountDistint(100L); + rightColStats.setNumNulls(0L); + rightStats.addToColumnStats(Collections.singletonList(rightColStats)); + + ReduceSinkOperator leftRsOp = mock(ReduceSinkOperator.class); + ReduceSinkDesc leftRsDesc = mock(ReduceSinkDesc.class); + when(leftRsOp.getStatistics()).thenReturn(leftStats); + when(leftRsOp.getConf()).thenReturn(leftRsDesc); + when(leftRsDesc.getOutputKeyColumnNames()).thenReturn(Arrays.asList("key")); + + ReduceSinkOperator rightRsOp = mock(ReduceSinkOperator.class); + ReduceSinkDesc rightRsDesc = mock(ReduceSinkDesc.class); + when(rightRsOp.getStatistics()).thenReturn(rightStats); + when(rightRsOp.getConf()).thenReturn(rightRsDesc); + when(rightRsDesc.getOutputKeyColumnNames()).thenReturn(Arrays.asList("key")); + + List> parents = new ArrayList<>(); + parents.add(leftRsOp); + parents.add(rightRsOp); + + JoinOperator joinOp = mock(JoinOperator.class); + JoinDesc joinDesc = mock(JoinDesc.class); + JoinCondDesc joinCond = new JoinCondDesc(0, 1, JoinDesc.INNER_JOIN); + when(joinOp.getParentOperators()).thenReturn(parents); + when(joinOp.getConf()).thenReturn(joinDesc); + when(joinDesc.getConds()).thenReturn(new JoinCondDesc[]{joinCond}); + + RowSchema rowSchema = mock(RowSchema.class); + ColumnInfo colInfo = new ColumnInfo("key", TypeInfoFactory.intTypeInfo, "", false); + when(rowSchema.getSignature()).thenReturn(Arrays.asList(colInfo)); + when(joinOp.getSchema()).thenReturn(rowSchema); + + final Statistics[] capturedStats = new Statistics[1]; + doAnswer(invocation -> { + capturedStats[0] = invocation.getArgument(0); + return null; + }).when(joinOp).setStatistics(any(Statistics.class)); + + StatsRulesProcFactory.JoinStatsRule rule = new StatsRulesProcFactory.JoinStatsRule(); + rule.process(joinOp, new Stack<>(), ctx); + + assertNotNull(capturedStats[0], "Statistics should have been set on join operator"); + assertEquals(Statistics.State.COMPLETE, capturedStats[0].getColumnStatsState(), + "Column stats state should be COMPLETE when using NDV=0 fallback"); + assertEquals(1100L, capturedStats[0].getNumRows(), + "Row count should use joinFactor heuristic: max(1000,500) * 1.1 = 1100"); + } + + private ExprNodeDesc createExprNodeConstantDesc(int value) { + return new ExprNodeConstantDesc(TypeInfoFactory.intTypeInfo, value); + } + + private Statistics createStatistics(long[] values, long numNulls) { + long numDVs = Arrays.stream(values).distinct().count(); + Statistics stats = new Statistics(values.length + numNulls, 100, 100, 100); + + HyperLogLog hll = StatisticsTestUtils.createHll(values); + float[] val = new float[values.length]; + for (int i = 0; i < values.length; i++) { + val[i] = values[i]; + } + KllFloatsSketch kll = StatisticsTestUtils.createKll(val); + ColStatistics colStatistics = createColStatistics(COL_NAME, "int", numNulls, numDVs, hll, kll); + + stats.addToColumnStats(Collections.singletonList(colStatistics)); + + return stats; + } + + private static ColStatistics createColStatistics( + String colName, String colType, long numNulls, long numDVs, Object hll, KllFloatsSketch kll) { + ColStatistics colStatistics = new ColStatistics(colName, colType); + + colStatistics.setNumNulls(numNulls); + colStatistics.setCountDistint(numDVs); + if (hll != null) { + if (hll instanceof HyperLogLog) { + colStatistics.setBitVectors(((HyperLogLog) hll).serialize()); + } else if (hll instanceof FMSketch) { + colStatistics.setBitVectors(((FMSketch) hll).serialize()); + } else { + throw new IllegalArgumentException("Unsupported HLL class: " + hll.getClass().getName()); + } + } + if (kll != null) { + colStatistics.setHistogram(kll.toByteArray()); + } + + return colStatistics; + } + + static Stream joinFactorEstimateTestData() { + return Stream.of( + Arguments.of("SingleParent", 1000L, 1, 1100L), + Arguments.of("TwoParents", 1000L, 2, 1100L), + Arguments.of("ThreeParents", 1000L, 3, 2200L), + Arguments.of("Overflow", Long.MAX_VALUE, 3, Long.MAX_VALUE) + ); + } + + static Stream hasZeroNdvJoinKeyTestData() { + // Helper to create Statistics with given row count + java.util.function.Function statsWithRows = rows -> { + Statistics s = new Statistics(rows, 100L, 0L, 0L); + s.setColumnStatsState(Statistics.State.COMPLETE); + return s; + }; + + // Helper to create ColStatistics with given NDV + java.util.function.BiConsumer addColWithNdv = (stats, ndv) -> { + ColStatistics cs = new ColStatistics("col", "int"); + cs.setCountDistint(ndv); + stats.addToColumnStats(Collections.singletonList(cs)); + }; + + // Empty joinKeys + Map> emptyKeys = new HashMap<>(); + Map emptyStats = new HashMap<>(); + + // Single table with <=1 row (should skip) + Map> singleRowKeys = new HashMap<>(); + singleRowKeys.put(0, Arrays.asList("col")); + Map singleRowStats = new HashMap<>(); + Statistics singleRowStat = statsWithRows.apply(1L); + addColWithNdv.accept(singleRowStat, 0L); + singleRowStats.put(0, singleRowStat); + + // Table with rows but no zero NDV + Map> noZeroNdvKeys = new HashMap<>(); + noZeroNdvKeys.put(0, Arrays.asList("col")); + Map noZeroNdvStats = new HashMap<>(); + Statistics noZeroNdvStat = statsWithRows.apply(100L); + addColWithNdv.accept(noZeroNdvStat, 50L); + noZeroNdvStats.put(0, noZeroNdvStat); + + // Table with rows and zero NDV + Map> zeroNdvKeys = new HashMap<>(); + zeroNdvKeys.put(0, Arrays.asList("col")); + Map zeroNdvStats = new HashMap<>(); + Statistics zeroNdvStat = statsWithRows.apply(100L); + addColWithNdv.accept(zeroNdvStat, 0L); + zeroNdvStats.put(0, zeroNdvStat); + + // Column not found (null ColStatistics) + Map> nullColKeys = new HashMap<>(); + nullColKeys.put(0, Arrays.asList("nonexistent")); + Map nullColStats = new HashMap<>(); + nullColStats.put(0, statsWithRows.apply(100L)); + + // Two tables: first has non-zero NDV, second has zero NDV + Map> mixedKeys = new HashMap<>(); + mixedKeys.put(0, Arrays.asList("col")); + mixedKeys.put(1, Arrays.asList("col")); + Map mixedStats = new HashMap<>(); + Statistics mixedStat0 = statsWithRows.apply(100L); + addColWithNdv.accept(mixedStat0, 50L); + mixedStats.put(0, mixedStat0); + Statistics mixedStat1 = statsWithRows.apply(100L); + addColWithNdv.accept(mixedStat1, 0L); + mixedStats.put(1, mixedStat1); + + // Two tables: first has zero NDV, second has non-zero NDV + Map> firstZeroKeys = new HashMap<>(); + firstZeroKeys.put(0, Arrays.asList("col")); + firstZeroKeys.put(1, Arrays.asList("col")); + Map firstZeroStats = new HashMap<>(); + Statistics firstZeroStat0 = statsWithRows.apply(100L); + addColWithNdv.accept(firstZeroStat0, 0L); + firstZeroStats.put(0, firstZeroStat0); + Statistics firstZeroStat1 = statsWithRows.apply(100L); + addColWithNdv.accept(firstZeroStat1, 50L); + firstZeroStats.put(1, firstZeroStat1); + + // Three tables: first two have non-zero NDV, third has zero NDV + Map> threeTableKeys = new HashMap<>(); + threeTableKeys.put(0, Arrays.asList("col")); + threeTableKeys.put(1, Arrays.asList("col")); + threeTableKeys.put(2, Arrays.asList("col")); + Map threeTableStats = new HashMap<>(); + Statistics threeStat0 = statsWithRows.apply(100L); + addColWithNdv.accept(threeStat0, 50L); + threeTableStats.put(0, threeStat0); + Statistics threeStat1 = statsWithRows.apply(100L); + addColWithNdv.accept(threeStat1, 25L); + threeTableStats.put(1, threeStat1); + Statistics threeStat2 = statsWithRows.apply(100L); + addColWithNdv.accept(threeStat2, 0L); + threeTableStats.put(2, threeStat2); + + // Two tables: first has 1 row (skipped), second has zero NDV + Map> skipFirstKeys = new HashMap<>(); + skipFirstKeys.put(0, Arrays.asList("col")); + skipFirstKeys.put(1, Arrays.asList("col")); + Map skipFirstStats = new HashMap<>(); + Statistics skipStat0 = statsWithRows.apply(1L); + addColWithNdv.accept(skipStat0, 0L); + skipFirstStats.put(0, skipStat0); + Statistics skipStat1 = statsWithRows.apply(100L); + addColWithNdv.accept(skipStat1, 0L); + skipFirstStats.put(1, skipStat1); + + return Stream.of( + Arguments.of("EmptyJoinKeys", emptyKeys, emptyStats, false), + Arguments.of("SingleRowTable", singleRowKeys, singleRowStats, false), + Arguments.of("NoZeroNdv", noZeroNdvKeys, noZeroNdvStats, false), + Arguments.of("HasZeroNdv", zeroNdvKeys, zeroNdvStats, true), + Arguments.of("NullColStatistics", nullColKeys, nullColStats, false), + Arguments.of("TwoTablesFirstHasZero", firstZeroKeys, firstZeroStats, true), + Arguments.of("TwoTablesSecondHasZero", mixedKeys, mixedStats, true), + Arguments.of("ThreeTablesThirdHasZero", threeTableKeys, threeTableStats, true), + Arguments.of("FirstSkippedSecondHasZero", skipFirstKeys, skipFirstStats, true) + ); } } diff --git a/ql/src/test/queries/clientpositive/ndv_zero_join_selectivity.q b/ql/src/test/queries/clientpositive/ndv_zero_join_selectivity.q new file mode 100644 index 000000000000..195b7d74de28 --- /dev/null +++ b/ql/src/test/queries/clientpositive/ndv_zero_join_selectivity.q @@ -0,0 +1,15 @@ +CREATE TABLE ndv_zero_t1 (id BIGINT, data STRING); +CREATE TABLE ndv_zero_t2 (id BIGINT, value STRING); + +ALTER TABLE ndv_zero_t1 UPDATE STATISTICS SET('numRows'='100000000','rawDataSize'='1000000000'); +ALTER TABLE ndv_zero_t1 UPDATE STATISTICS FOR COLUMN id SET('numDVs'='0','numNulls'='0'); +ALTER TABLE ndv_zero_t1 UPDATE STATISTICS FOR COLUMN data SET('numDVs'='1000','numNulls'='0','avgColLen'='10','maxColLen'='50'); + +ALTER TABLE ndv_zero_t2 UPDATE STATISTICS SET('numRows'='100000000','rawDataSize'='1000000000'); +ALTER TABLE ndv_zero_t2 UPDATE STATISTICS FOR COLUMN id SET('numDVs'='0','numNulls'='0'); +ALTER TABLE ndv_zero_t2 UPDATE STATISTICS FOR COLUMN value SET('numDVs'='1000','numNulls'='0','avgColLen'='10','maxColLen'='50'); + +EXPLAIN +SELECT t1.id, t2.value +FROM ndv_zero_t1 t1 +JOIN ndv_zero_t2 t2 ON t1.id = t2.id; diff --git a/ql/src/test/results/clientpositive/llap/ndv_zero_join_selectivity.q.out b/ql/src/test/results/clientpositive/llap/ndv_zero_join_selectivity.q.out new file mode 100644 index 000000000000..64c9b3e2355b --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/ndv_zero_join_selectivity.q.out @@ -0,0 +1,164 @@ +PREHOOK: query: CREATE TABLE ndv_zero_t1 (id BIGINT, data STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@ndv_zero_t1 +POSTHOOK: query: CREATE TABLE ndv_zero_t1 (id BIGINT, data STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@ndv_zero_t1 +PREHOOK: query: CREATE TABLE ndv_zero_t2 (id BIGINT, value STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@ndv_zero_t2 +POSTHOOK: query: CREATE TABLE ndv_zero_t2 (id BIGINT, value STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@ndv_zero_t2 +PREHOOK: query: ALTER TABLE ndv_zero_t1 UPDATE STATISTICS SET('numRows'='100000000','rawDataSize'='1000000000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@ndv_zero_t1 +PREHOOK: Output: default@ndv_zero_t1 +POSTHOOK: query: ALTER TABLE ndv_zero_t1 UPDATE STATISTICS SET('numRows'='100000000','rawDataSize'='1000000000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@ndv_zero_t1 +POSTHOOK: Output: default@ndv_zero_t1 +PREHOOK: query: ALTER TABLE ndv_zero_t1 UPDATE STATISTICS FOR COLUMN id SET('numDVs'='0','numNulls'='0') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@ndv_zero_t1 +PREHOOK: Output: default@ndv_zero_t1 +POSTHOOK: query: ALTER TABLE ndv_zero_t1 UPDATE STATISTICS FOR COLUMN id SET('numDVs'='0','numNulls'='0') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@ndv_zero_t1 +POSTHOOK: Output: default@ndv_zero_t1 +PREHOOK: query: ALTER TABLE ndv_zero_t1 UPDATE STATISTICS FOR COLUMN data SET('numDVs'='1000','numNulls'='0','avgColLen'='10','maxColLen'='50') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@ndv_zero_t1 +PREHOOK: Output: default@ndv_zero_t1 +POSTHOOK: query: ALTER TABLE ndv_zero_t1 UPDATE STATISTICS FOR COLUMN data SET('numDVs'='1000','numNulls'='0','avgColLen'='10','maxColLen'='50') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@ndv_zero_t1 +POSTHOOK: Output: default@ndv_zero_t1 +PREHOOK: query: ALTER TABLE ndv_zero_t2 UPDATE STATISTICS SET('numRows'='100000000','rawDataSize'='1000000000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@ndv_zero_t2 +PREHOOK: Output: default@ndv_zero_t2 +POSTHOOK: query: ALTER TABLE ndv_zero_t2 UPDATE STATISTICS SET('numRows'='100000000','rawDataSize'='1000000000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@ndv_zero_t2 +POSTHOOK: Output: default@ndv_zero_t2 +PREHOOK: query: ALTER TABLE ndv_zero_t2 UPDATE STATISTICS FOR COLUMN id SET('numDVs'='0','numNulls'='0') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@ndv_zero_t2 +PREHOOK: Output: default@ndv_zero_t2 +POSTHOOK: query: ALTER TABLE ndv_zero_t2 UPDATE STATISTICS FOR COLUMN id SET('numDVs'='0','numNulls'='0') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@ndv_zero_t2 +POSTHOOK: Output: default@ndv_zero_t2 +PREHOOK: query: ALTER TABLE ndv_zero_t2 UPDATE STATISTICS FOR COLUMN value SET('numDVs'='1000','numNulls'='0','avgColLen'='10','maxColLen'='50') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@ndv_zero_t2 +PREHOOK: Output: default@ndv_zero_t2 +POSTHOOK: query: ALTER TABLE ndv_zero_t2 UPDATE STATISTICS FOR COLUMN value SET('numDVs'='1000','numNulls'='0','avgColLen'='10','maxColLen'='50') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@ndv_zero_t2 +POSTHOOK: Output: default@ndv_zero_t2 +PREHOOK: query: EXPLAIN +SELECT t1.id, t2.value +FROM ndv_zero_t1 t1 +JOIN ndv_zero_t2 t2 ON t1.id = t2.id +PREHOOK: type: QUERY +PREHOOK: Input: default@ndv_zero_t1 +PREHOOK: Input: default@ndv_zero_t2 +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN +SELECT t1.id, t2.value +FROM ndv_zero_t1 t1 +JOIN ndv_zero_t2 t2 ON t1.id = t2.id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ndv_zero_t1 +POSTHOOK: Input: default@ndv_zero_t2 +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t1 + filterExpr: id is not null (type: boolean) + Statistics: Num rows: 100000000 Data size: 800000000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: id is not null (type: boolean) + Statistics: Num rows: 100000000 Data size: 800000000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 100000000 Data size: 800000000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 100000000 Data size: 800000000 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: t2 + filterExpr: id is not null (type: boolean) + Statistics: Num rows: 100000000 Data size: 10200000000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: id is not null (type: boolean) + Statistics: Num rows: 100000000 Data size: 10200000000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: bigint), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 100000000 Data size: 10200000000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: bigint) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: bigint) + Statistics: Num rows: 100000000 Data size: 10200000000 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: bigint) + 1 _col0 (type: bigint) + outputColumnNames: _col0, _col2 + Statistics: Num rows: 110000002 Data size: 11220000204 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: bigint), _col2 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 110000002 Data size: 11220000204 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 110000002 Data size: 11220000204 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + diff --git a/ql/src/test/results/clientpositive/llap/vector_binary_join_groupby.q.out b/ql/src/test/results/clientpositive/llap/vector_binary_join_groupby.q.out index 41bc14e5e354..6de2c9fbf920 100644 --- a/ql/src/test/results/clientpositive/llap/vector_binary_join_groupby.q.out +++ b/ql/src/test/results/clientpositive/llap/vector_binary_join_groupby.q.out @@ -170,7 +170,7 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21 input vertices: 1 Map 3 - Statistics: Num rows: 10000 Data size: 6819968 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 110 Data size: 74988 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: hash(_col0,_col1,_col2,_col3,_col4,_col5,_col6,_col7,_col8,_col9,_col10,_col11,_col12,_col13,_col14,_col15,_col16,_col17,_col18,_col19,_col20,_col21) (type: int) outputColumnNames: _col0 @@ -179,7 +179,7 @@ STAGE PLANS: native: true projectedOutputColumnNums: [23] selectExpressions: VectorUDFAdaptor(hash(_col0,_col1,_col2,_col3,_col4,_col5,_col6,_col7,_col8,_col9,_col10,_col11,_col12,_col13,_col14,_col15,_col16,_col17,_col18,_col19,_col20,_col21)) -> 23:int - Statistics: Num rows: 10000 Data size: 6819968 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 110 Data size: 74988 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(_col0) Group By Vectorization: