Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public class ColStatistics {
private boolean isPrimaryKey;
private boolean isEstimated;
private boolean isFilteredColumn;
private boolean isConst;
private byte[] bitVectors;
private byte[] histogram;

Expand Down Expand Up @@ -155,6 +156,8 @@ public String toString() {

sb.append(" isEstimated: ");
sb.append(isEstimated);
sb.append(" isConst: ");
sb.append(isConst);
return sb.toString();
}

Expand All @@ -171,6 +174,7 @@ public ColStatistics clone() {
clone.setPrimaryKey(isPrimaryKey);
clone.setIsEstimated(isEstimated);
clone.setIsFilteredColumn(isFilteredColumn);
clone.setConst(isConst);
if (range != null ) {
clone.setRange(range.clone());
}
Expand Down Expand Up @@ -232,4 +236,12 @@ public boolean isFilteredColumn() {
return isFilteredColumn;
}

public boolean isConst() {
return isConst;
}

public void setConst(boolean isConst) {
this.isConst = isConst;
}

}
7 changes: 6 additions & 1 deletion ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -823,9 +823,11 @@ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String col
if (numTrues == 0 && numFalses == 0) {
// All NULL column - no non-null distinct values
cs.setCountDistint(0);
cs.setConst(true);
} else if (numTrues == 0 || numFalses == 0) {
// One value type confirmed absent (=0), other is present (>0) or unknown (<0)
cs.setCountDistint(1);
cs.setConst(csd.getBooleanStats().getNumNulls() == 0 && (numTrues > 0 || numFalses > 0));
} else {
// Both != 0: either both present (>0), both unknown (<0), or one present + one unknown
cs.setCountDistint(2);
Expand Down Expand Up @@ -1646,6 +1648,7 @@ private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows
colStats.setAvgColLen(avgColSize);
colStats.setCountDistint(countDistincts);
colStats.setNumNulls(numNulls);
colStats.setConst(true);

Optional<Number> value = getConstValue(encd);
value.ifPresent(number -> colStats.setRange(number, number));
Expand Down Expand Up @@ -2112,7 +2115,9 @@ private static List<Long> extractNDVGroupingColumns(List<ColStatistics> colStats
for (ColStatistics cs : colStats) {
if (cs != null) {
long ndv = cs.getCountDistint();
if (cs.getNumNulls() > 0) {
if (cs.isConst()) {
ndv = 1;
} else if (ndv > 0 && cs.getNumNulls() > 0) {
ndv = StatsUtils.safeAdd(ndv, 1);
}
Comment on lines 2117 to 2122
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we know that a column is a constant then shouldn't ndv = 1? Why do we want to do +1 ? Why do we care about the number of nulls?

Copy link
Copy Markdown
Contributor Author

@konstantinb konstantinb May 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. The +1 accounts for NULL being its own GROUP BY bucket — that's why numNulls > 0 gates it for the non-const case. The || cs.isConst() clause was specifically to handle the all-NULL
const case (countDistint=0, numNulls>0) where we still want one bucket.

But you're right that ndv = 1 is the cleaner way to express it. The current form works only because all three setConst(true) sites today satisfy the invariant
!(countDistint>0 && numNulls>0)
— that's implicit and would silently give the wrong answer (2 buckets) if some future code path marked a column const while preserving both a non-null distinct count and nulls (although this would also completely redefine what an NDV of a "const" means).

I'll switch to a much more readable variant:

if (cs.isConst()) {
  ndv = 1;
} else if (ndv > 0 && cs.getNumNulls() > 0) {
  ndv = StatsUtils.safeAdd(ndv, 1);
}

ndvValues.add(ndv);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ public void add(ColStatistics stat) {
if (stat.isFilteredColumn()) {
result.setFilterColumn();
}
result.setConst(result.isConst() && stat.isConst());
}
public Optional<ColStatistics> getResult() {
return Optional.of(result);
Expand Down
85 changes: 85 additions & 0 deletions ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.CsvSource;
import org.junit.jupiter.params.provider.MethodSource;

import com.google.common.collect.Sets;
Expand Down Expand Up @@ -367,6 +368,46 @@ void testGetColStatisticsBooleanAllNull() {

assertNotNull(cs);
assertEquals(0, cs.getCountDistint(), "Boolean NDV should be 0 for all-NULL column");
assertEquals(true, cs.isConst(), "all-NULL boolean column must be marked isConst");
}

@ParameterizedTest(name = "isConst=true for verified single-value boolean (numTrues={0}, numFalses={1}, numNulls=0)")
@org.junit.jupiter.params.provider.CsvSource({"100, 0", "0, 100"})
void testGetColStatisticsBooleanIsConstForVerifiedSingleValue(long numTrues, long numFalses) {
ColumnStatisticsObj cso = new ColumnStatisticsObj();
cso.setColName("bool_col");
cso.setColType(serdeConstants.BOOLEAN_TYPE_NAME);
BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
boolStats.setNumTrues(numTrues);
boolStats.setNumFalses(numFalses);
boolStats.setNumNulls(0);
ColumnStatisticsData data = new ColumnStatisticsData();
data.setBooleanStats(boolStats);
cso.setStatsData(data);

ColStatistics cs = StatsUtils.getColStatistics(cso, "bool_col");

assertEquals(true, cs.isConst());
}

@ParameterizedTest(name = "isConst stays false for boolean (numTrues={0}, numFalses={1})")
@org.junit.jupiter.params.provider.CsvSource({
"100, 0", "0, 100", "-1, 100", "100, -1", "50, 50"})
void testGetColStatisticsBooleanIsConstNotSetForNonAllNullCases(long numTrues, long numFalses) {
ColumnStatisticsObj cso = new ColumnStatisticsObj();
cso.setColName("bool_col");
cso.setColType(serdeConstants.BOOLEAN_TYPE_NAME);
BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
boolStats.setNumTrues(numTrues);
boolStats.setNumFalses(numFalses);
boolStats.setNumNulls(10);
ColumnStatisticsData data = new ColumnStatisticsData();
data.setBooleanStats(boolStats);
cso.setStatsData(data);

ColStatistics cs = StatsUtils.getColStatistics(cso, "bool_col");

assertEquals(false, cs.isConst());
}

@Test
Expand Down Expand Up @@ -565,4 +606,48 @@ void testGetColStatisticsTimestampType() {
assertEquals(1700000000L, range.maxValue.longValue(), "maxValue mismatch for TIMESTAMP");
}

@Test
void testColStatisticsIsConstDefaultsFalse() {
ColStatistics cs = new ColStatistics("c", "int");
assertEquals(false, cs.isConst(), "isConst should default to false");
}

@Test
void testColStatisticsIsConstSetterAndClone() {
ColStatistics cs = new ColStatistics("c", "int");
cs.setConst(true);
assertEquals(true, cs.isConst(), "setConst(true) should be observable via isConst()");

ColStatistics clone = cs.clone();
assertEquals(true, clone.isConst(), "clone() must propagate isConst");

cs.setConst(false);
assertEquals(false, cs.isConst(), "setConst(false) should clear the flag");
assertEquals(true, clone.isConst(), "Clone should be independent of source after mutation");
}

@ParameterizedTest(name = "isConst=true => NDV contribution = 1 (countDistint={0}, numNulls={1})")
@CsvSource({"1, 0", "0, 10", "1, 10", "0, 0"})
void testComputeNDVGroupingColumnsConstColumnContributesOne(long countDistint, long numNulls) {
ColStatistics cs = new ColStatistics("c", "int");
cs.setCountDistint(countDistint);
cs.setNumNulls(numNulls);
cs.setConst(true);

long ndv = StatsUtils.computeNDVGroupingColumns(Collections.singletonList(cs), new Statistics(), false);

assertEquals(1L, ndv);
}

@Test
void testComputeNDVGroupingColumnsNonConstWithNullsAddsOne() {
ColStatistics cs = new ColStatistics("c", "int");
cs.setCountDistint(7);
cs.setNumNulls(3);

long ndv = StatsUtils.computeNDVGroupingColumns(Collections.singletonList(cs), new Statistics(), false);

assertEquals(8L, ndv);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;

class TestPessimisticStatCombiner {

Expand Down Expand Up @@ -155,6 +157,21 @@ void testCombineBothUnknownNumTruesAndNumFalses() {
assertEquals(-1, combined.getNumFalses(), "Both unknown should result in unknown (-1)");
}

@ParameterizedTest(name = "combine isConst({0}, {1}) = {2}")
@CsvSource({"true, true, true", "true, false, false", "false, true, false", "false, false, false"})
void testCombineIsConstAndSemantics(boolean stat1Const, boolean stat2Const, boolean expected) {
ColStatistics stat1 = createStat("col1", "int", 0, 100, 4.0);
stat1.setConst(stat1Const);
ColStatistics stat2 = createStat("col2", "int", 0, 100, 4.0);
stat2.setConst(stat2Const);

PessimisticStatCombiner combiner = new PessimisticStatCombiner();
combiner.add(stat1);
combiner.add(stat2);

assertEquals(expected, combiner.getResult().get().isConst());
}

private ColStatistics createStat(String name, String type, long ndv, long numNulls, double avgColLen) {
ColStatistics stat = new ColStatistics(name, type);
stat.setCountDistint(ndv);
Expand Down
30 changes: 30 additions & 0 deletions ql/src/test/queries/clientpositive/groupby_unknown_ndv.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
-- HIVE-29556: GROUP BY of a column with unknown NDV (NDV=0 sentinel + numNulls>0)
-- must not collapse to "1 row" estimate. Both probes feed a join sized so that
-- master's bogus 1-row estimate triggers Map Join while the heuristic fallback
-- forces Merge Join.

SET hive.auto.convert.join=true;

CREATE TABLE big (k BIGINT);
CREATE TABLE small (id BIGINT, name STRING);

ALTER TABLE big UPDATE STATISTICS SET('numRows'='100000000');
ALTER TABLE big UPDATE STATISTICS for column k SET ('numDVs'='0','numNulls'='100000000');

ALTER TABLE small UPDATE STATISTICS SET('numRows'='1000000');
ALTER TABLE small UPDATE STATISTICS for column id SET ('numDVs'='1000000','numNulls'='0');
ALTER TABLE small UPDATE STATISTICS for column name SET ('numDVs'='1000000','numNulls'='0','avgColLen'='100','maxColLen'='100');

-- U1: direct column with unknown NDV.
EXPLAIN
SELECT s.name, g.cnt
FROM (SELECT k, COUNT(*) AS cnt FROM big GROUP BY k) g
JOIN small s ON g.cnt = s.id;

-- U2: PessimisticStatCombiner output for CASE WHEN with one NULL branch.
EXPLAIN
SELECT s.name, g.cnt
FROM (SELECT x, COUNT(*) AS cnt
FROM (SELECT CASE WHEN k > 0 THEN k ELSE cast(NULL AS BIGINT) END AS x FROM big) t
GROUP BY x) g
JOIN small s ON g.cnt = s.id;
Loading
Loading