From 8896ff0bc543d431a7c470c69664f4d284090352 Mon Sep 17 00:00:00 2001 From: kasakrisz Date: Tue, 12 May 2026 10:34:57 +0200 Subject: [PATCH] HIVE-29617: Unable to load column statistics of Iceberg table after upgrading Hive --- .../mr/hive/HiveIcebergStorageHandler.java | 31 ++- .../iceberg/mr/hive/IcebergTableUtil.java | 4 +- .../mr/hive/stats/ColumnStatsConverter.java | 177 ++++++++++++++++++ .../mr/hive/stats/HiveBinaryConverter.java | 41 ++++ .../mr/hive/stats/HiveBinaryStats.java | 31 +++ .../mr/hive/stats/HiveBooleanConverter.java | 40 ++++ .../mr/hive/stats/HiveBooleanStats.java | 31 +++ .../mr/hive/stats/HiveColumnStatistics.java | 33 ++++ .../hive/stats/HiveColumnStatisticsData.java | 26 +++ .../hive/stats/HiveColumnStatisticsDesc.java | 34 ++++ .../hive/stats/HiveColumnStatisticsObj.java | 31 +++ .../mr/hive/stats/HiveDateConverter.java | 55 ++++++ .../iceberg/mr/hive/stats/HiveDateStats.java | 33 ++++ .../mr/hive/stats/HiveDecimalConverter.java | 61 ++++++ .../mr/hive/stats/HiveDecimalStats.java | 33 ++++ .../mr/hive/stats/HiveDoubleConverter.java | 49 +++++ .../mr/hive/stats/HiveDoubleStats.java | 33 ++++ .../mr/hive/stats/HiveLongConverter.java | 46 +++++ .../iceberg/mr/hive/stats/HiveLongStats.java | 33 ++++ .../mr/hive/stats/HiveSerializableDate.java | 27 +++ .../hive/stats/HiveSerializableDecimal.java | 27 +++ .../hive/stats/HiveSerializableTimestamp.java | 27 +++ .../mr/hive/stats/HiveStringConverter.java | 45 +++++ .../mr/hive/stats/HiveStringStats.java | 32 ++++ .../mr/hive/stats/HiveTimestampConverter.java | 61 ++++++ .../mr/hive/stats/HiveTimestampStats.java | 33 ++++ .../mr/hive/stats/HiveTypeStatsConverter.java | 24 +++ .../hadoop/hive/ql/stats/StatsUtils.java | 19 +- 28 files changed, 1101 insertions(+), 16 deletions(-) create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/ColumnStatsConverter.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBinaryConverter.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBinaryStats.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBooleanConverter.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBooleanStats.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatistics.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatisticsData.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatisticsDesc.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatisticsObj.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDateConverter.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDateStats.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDecimalConverter.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDecimalStats.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDoubleConverter.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDoubleStats.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveLongConverter.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveLongStats.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveSerializableDate.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveSerializableDecimal.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveSerializableTimestamp.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveStringConverter.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveStringStats.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveTimestampConverter.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveTimestampStats.java create mode 100644 iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveTypeStatsConverter.java diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java index 65852f1a8553..8d02647111de 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java @@ -188,6 +188,9 @@ import org.apache.iceberg.mr.InputFormatConfig; import org.apache.iceberg.mr.hive.actions.HiveIcebergDeleteOrphanFiles; import org.apache.iceberg.mr.hive.plan.IcebergBucketFunction; +import org.apache.iceberg.mr.hive.stats.ColumnStatsConverter; +import org.apache.iceberg.mr.hive.stats.HiveColumnStatistics; +import org.apache.iceberg.mr.hive.stats.HiveColumnStatisticsObj; import org.apache.iceberg.mr.hive.udf.GenericUDFIcebergZorder; import org.apache.iceberg.puffin.Blob; import org.apache.iceberg.puffin.BlobMetadata; @@ -644,8 +647,12 @@ private boolean writeColStats(List colStats, Table tbl) { Map properties = isTblLevel ? Map.of() : Map.of(PARTITION, String.valueOf(stats.getStatsDesc().getPartName())); - List statsObjects = isTblLevel ? - stats.getStatsObj() : List.of(stats); + List statsObjects; + if (isTblLevel) { + statsObjects = stats.getStatsObj().stream().map(ColumnStatsConverter::fromThrift).toList(); + } else { + statsObjects = List.of(ColumnStatsConverter.fromThrift(stats)); + } List fieldIds = null; @@ -664,11 +671,11 @@ private boolean writeColStats(List colStats, Table tbl) { if (isTblLevel) { fieldIds = List.of(schema.findField( - ((ColumnStatisticsObj) statsObj).getColName()).fieldId()); + ((HiveColumnStatisticsObj) statsObj).colName()).fieldId()); } writer.add(new Blob( - ColumnStatisticsObj.class.getSimpleName(), + HiveColumnStatisticsObj.class.getSimpleName(), fieldIds, snapshotId, snapshotSequenceNumber, @@ -753,7 +760,9 @@ public List getColStatistics(org.apache.hadoop.hive.ql.meta filter = null; } - return IcebergTableUtil.readColStats(table, snapshot.snapshotId(), filter); + List columnStatisticsObjList = + IcebergTableUtil.readColStats(table, snapshot.snapshotId(), filter); + return columnStatisticsObjList.stream().map(ColumnStatsConverter::toThrift).toList(); } @Override @@ -773,7 +782,9 @@ public AggrStats getAggrColStatsFor(org.apache.hadoop.hive.ql.metadata.Table hms Set partitions = Sets.newHashSet(partNames); Predicate filter = metadata -> partitions.contains(metadata.properties().get(PARTITION)); - List partStats = IcebergTableUtil.readColStats(table, snapshot.snapshotId(), filter); + List storedStats = + IcebergTableUtil.readColStats(table, snapshot.snapshotId(), filter); + List partStats = storedStats.stream().map(ColumnStatsConverter::toThrift).toList(); partStats.forEach(colStats -> colStats.getStatsObj().removeIf(statsObj -> !colNames.contains(statsObj.getColName()))); @@ -831,13 +842,17 @@ private void checkAndMergeColStats(List statsNew, Table tbl) t boolean isTblLevel = statsNew.getFirst().getStatsDesc().isIsTblLevel(); Map oldStatsMap = Maps.newHashMap(); - List statsOld = IcebergTableUtil.readColStats(tbl, previousSnapshotId, null); - + List statsOld; if (!isTblLevel) { + List storedStats = IcebergTableUtil.readColStats(tbl, previousSnapshotId, null); + statsOld = storedStats.stream().map(ColumnStatsConverter::toThrift).toList(); + for (ColumnStatistics statsObjOld : (List) statsOld) { oldStatsMap.put(statsObjOld.getStatsDesc().getPartName(), statsObjOld); } } else { + List storedStatObjects = IcebergTableUtil.readColStats(tbl, previousSnapshotId, null); + statsOld = storedStatObjects.stream().map(ColumnStatsConverter::toThrift).toList(); statsOld = Collections.singletonList( new ColumnStatistics(null, (List) statsOld)); } diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java index 386473f46fee..03016dc744a9 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java @@ -47,7 +47,6 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.IMetaStoreClient; import org.apache.hadoop.hive.metastore.Warehouse; -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.metastore.utils.TableFetcher; @@ -101,6 +100,7 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.mr.Catalogs; import org.apache.iceberg.mr.InputFormatConfig; +import org.apache.iceberg.mr.hive.stats.HiveColumnStatisticsObj; import org.apache.iceberg.puffin.BlobMetadata; import org.apache.iceberg.puffin.Puffin; import org.apache.iceberg.puffin.PuffinReader; @@ -256,7 +256,7 @@ static String getColStatsPath(Table table, long snapshotId) { return table.statisticsFiles().stream() .filter(stats -> stats.snapshotId() == snapshotId) .filter(stats -> stats.blobMetadata().stream() - .anyMatch(metadata -> ColumnStatisticsObj.class.getSimpleName().equals(metadata.type())) + .anyMatch(metadata -> HiveColumnStatisticsObj.class.getSimpleName().equals(metadata.type())) ) .map(StatisticsFile::path) .findAny().orElse(null); diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/ColumnStatsConverter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/ColumnStatsConverter.java new file mode 100644 index 000000000000..0894f5a5c1d1 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/ColumnStatsConverter.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.nio.ByteBuffer; +import java.util.stream.Collectors; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; + +public final class ColumnStatsConverter { + private static final HiveBooleanConverter BOOLEAN = new HiveBooleanConverter(); + private static final HiveLongConverter LONG = new HiveLongConverter(); + private static final HiveDoubleConverter DOUBLE = new HiveDoubleConverter(); + private static final HiveStringConverter STRING = new HiveStringConverter(); + private static final HiveBinaryConverter BINARY = new HiveBinaryConverter(); + private static final HiveDecimalConverter DECIMAL = new HiveDecimalConverter(); + private static final HiveDateConverter DATE = new HiveDateConverter(); + private static final HiveTimestampConverter TIMESTAMP = new HiveTimestampConverter(); + + private ColumnStatsConverter() { + + } + + public static HiveColumnStatistics fromThrift(ColumnStatistics columnStatistics) { + if (columnStatistics == null) { + return null; + } + + ColumnStatisticsDesc tDesc = columnStatistics.getStatsDesc(); + return new HiveColumnStatistics( + new HiveColumnStatisticsDesc(tDesc.isIsTblLevel(), tDesc.getDbName(), tDesc.getTableName(), + tDesc.getPartName(), tDesc.isSetLastAnalyzed() ? tDesc.getLastAnalyzed() : null, tDesc.getCatName()), + columnStatistics.getStatsObj().stream().map(ColumnStatsConverter::fromThrift).collect(Collectors.toList()), + columnStatistics.isSetIsStatsCompliant() ? columnStatistics.isIsStatsCompliant() : null, + columnStatistics.getEngine() + ); + } + + public static HiveColumnStatisticsObj fromThrift(ColumnStatisticsObj columnStatisticsObj) { + return new HiveColumnStatisticsObj( + columnStatisticsObj.getColName(), + columnStatisticsObj.getColType(), + fromThriftData(columnStatisticsObj.getStatsData())); + } + + private static HiveColumnStatisticsData fromThriftData(ColumnStatisticsData columnStatisticsData) { + if (columnStatisticsData.isSetBooleanStats()) { + return BOOLEAN.fromThrift(columnStatisticsData.getBooleanStats()); + } + + if (columnStatisticsData.isSetLongStats()) { + return LONG.fromThrift(columnStatisticsData.getLongStats()); + } + + if (columnStatisticsData.isSetDoubleStats()) { + return DOUBLE.fromThrift(columnStatisticsData.getDoubleStats()); + } + + if (columnStatisticsData.isSetStringStats()) { + return STRING.fromThrift(columnStatisticsData.getStringStats()); + } + + if (columnStatisticsData.isSetBinaryStats()) { + return BINARY.fromThrift(columnStatisticsData.getBinaryStats()); + } + + if (columnStatisticsData.isSetDecimalStats()) { + return DECIMAL.fromThrift(columnStatisticsData.getDecimalStats()); + } + + if (columnStatisticsData.isSetDateStats()) { + return DATE.fromThrift(columnStatisticsData.getDateStats()); + } + + if (columnStatisticsData.isSetTimestampStats()) { + return TIMESTAMP.fromThrift(columnStatisticsData.getTimestampStats()); + } + + throw new UnsupportedOperationException("Unsupported Hive Stat Type"); + } + + public static ColumnStatistics toThrift(HiveColumnStatistics record) { + if (record == null) { + return null; + } + + ColumnStatistics result = new ColumnStatistics(); + + if (record.statsDesc() != null) { + result.setStatsDesc(toThrift(record.statsDesc())); + } + + if (record.statsObj() != null) { + result.setStatsObj(record.statsObj().stream() + .map(ColumnStatsConverter::toThrift) + .collect(java.util.stream.Collectors.toList())); + } + + if (record.isStatsCompliant() != null) { + result.setIsStatsCompliant(record.isStatsCompliant()); + } + + result.setEngine(record.engine()); + + return result; + } + + private static ColumnStatisticsDesc toThrift(HiveColumnStatisticsDesc columnStatisticsDesc) { + var result = new ColumnStatisticsDesc( + columnStatisticsDesc.isTblLevel(), + columnStatisticsDesc.dbName(), + columnStatisticsDesc.tableName()); + + if (columnStatisticsDesc.partName() != null) { + result.setPartName(columnStatisticsDesc.partName()); + } + if (columnStatisticsDesc.lastAnalyzed() != null) { + result.setLastAnalyzed(columnStatisticsDesc.lastAnalyzed()); + } + if (columnStatisticsDesc.catName() != null) { + result.setCatName(columnStatisticsDesc.catName()); + } + + return result; + } + + public static ColumnStatisticsObj toThrift(HiveColumnStatisticsObj columnStatisticsObj) { + return new ColumnStatisticsObj( + columnStatisticsObj.colName(), + columnStatisticsObj.colType(), + toThrift(columnStatisticsObj.statsData()) + ); + } + + private static ColumnStatisticsData toThrift(HiveColumnStatisticsData columnStatisticsData) { + ColumnStatisticsData result = new ColumnStatisticsData(); + switch (columnStatisticsData) { + case HiveBooleanStats statsValue -> result.setBooleanStats(BOOLEAN.toThrift(statsValue)); + case HiveLongStats statsValue -> result.setLongStats(LONG.toThrift(statsValue)); + case HiveDoubleStats statsValue -> result.setDoubleStats(DOUBLE.toThrift(statsValue)); + case HiveStringStats statsValue -> result.setStringStats(STRING.toThrift(statsValue)); + case HiveBinaryStats statsValue -> result.setBinaryStats(BINARY.toThrift(statsValue)); + case HiveDecimalStats statsValue -> result.setDecimalStats(DECIMAL.toThrift(statsValue)); + case HiveDateStats statsValue -> result.setDateStats(DATE.toThrift(statsValue)); + case HiveTimestampStats statsValue -> result.setTimestampStats(TIMESTAMP.toThrift(statsValue)); + } + return result; + } + + public static byte[] toBytes(ByteBuffer byteBuffer) { + if (byteBuffer == null) { + return null; + } + + byte[] arr = new byte[byteBuffer.remaining()]; + byteBuffer.duplicate().get(arr); + return arr; + } +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBinaryConverter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBinaryConverter.java new file mode 100644 index 000000000000..a3dbfda55552 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBinaryConverter.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData; + +public class HiveBinaryConverter implements HiveTypeStatsConverter { + + public HiveBinaryStats fromThrift(BinaryColumnStatsData binaryColumnStatsData) { + return new HiveBinaryStats( + binaryColumnStatsData.getMaxColLen(), + binaryColumnStatsData.getAvgColLen(), + binaryColumnStatsData.getNumNulls(), + ColumnStatsConverter.toBytes(binaryColumnStatsData.bufferForBitVectors())); + } + + public BinaryColumnStatsData toThrift(HiveBinaryStats binaryStats) { + BinaryColumnStatsData binaryColumnStatsData = + new BinaryColumnStatsData(binaryStats.maxColLen(), binaryStats.avgColLen(), binaryStats.numNulls()); + + binaryColumnStatsData.setBitVectors(binaryStats.bitVectors()); + + return binaryColumnStatsData; + } +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBinaryStats.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBinaryStats.java new file mode 100644 index 000000000000..f1a972013bc7 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBinaryStats.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; + +public record HiveBinaryStats( + long maxColLen, + double avgColLen, + long numNulls, + byte[] bitVectors +) implements HiveColumnStatisticsData { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBooleanConverter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBooleanConverter.java new file mode 100644 index 000000000000..d06badc2d2b3 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBooleanConverter.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData; + +public class HiveBooleanConverter implements HiveTypeStatsConverter { + public HiveBooleanStats fromThrift(BooleanColumnStatsData booleanColumnStatsData) { + return new HiveBooleanStats( + booleanColumnStatsData.getNumTrues(), + booleanColumnStatsData.getNumFalses(), + booleanColumnStatsData.getNumNulls(), + ColumnStatsConverter.toBytes(booleanColumnStatsData.bufferForBitVectors())); + } + + public BooleanColumnStatsData toThrift(HiveBooleanStats booleanStats) { + BooleanColumnStatsData booleanColumnStatsData = new BooleanColumnStatsData( + booleanStats.numTrues(), + booleanStats.numFalses(), + booleanStats.numNulls()); + booleanColumnStatsData.setBitVectors(booleanStats.bitVectors()); + return booleanColumnStatsData; + } +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBooleanStats.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBooleanStats.java new file mode 100644 index 000000000000..e70f9a4a2dfa --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBooleanStats.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; + +public record HiveBooleanStats( + long numTrues, + long numFalses, + long numNulls, + byte[] bitVectors +) implements HiveColumnStatisticsData { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatistics.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatistics.java new file mode 100644 index 000000000000..9cca98eafe86 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatistics.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; +import java.io.Serializable; +import java.util.List; + +public record HiveColumnStatistics( + HiveColumnStatisticsDesc statsDesc, + List statsObj, + Boolean isStatsCompliant, + String engine +) implements Serializable { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatisticsData.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatisticsData.java new file mode 100644 index 000000000000..22d8a906d86d --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatisticsData.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serializable; + +public sealed interface HiveColumnStatisticsData extends Serializable + permits HiveBooleanStats, HiveLongStats, HiveDoubleStats, HiveStringStats, + HiveBinaryStats, HiveDecimalStats, HiveDateStats, HiveTimestampStats { +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatisticsDesc.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatisticsDesc.java new file mode 100644 index 000000000000..2eabdbeda242 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatisticsDesc.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; +import java.io.Serializable; + +public record HiveColumnStatisticsDesc( + boolean isTblLevel, + String dbName, + String tableName, + String partName, + Long lastAnalyzed, + String catName +) implements Serializable { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatisticsObj.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatisticsObj.java new file mode 100644 index 000000000000..ddc806beb3bb --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveColumnStatisticsObj.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; +import java.io.Serializable; + +public record HiveColumnStatisticsObj( + String colName, + String colType, + HiveColumnStatisticsData statsData +) implements Serializable { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDateConverter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDateConverter.java new file mode 100644 index 000000000000..204791c13107 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDateConverter.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import org.apache.hadoop.hive.metastore.api.Date; +import org.apache.hadoop.hive.metastore.api.DateColumnStatsData; + +public class HiveDateConverter implements HiveTypeStatsConverter { + public HiveDateStats fromThrift(DateColumnStatsData dateColumnStatsData) { + return new HiveDateStats( + fromThrift(dateColumnStatsData.getLowValue()), + fromThrift(dateColumnStatsData.getHighValue()), + dateColumnStatsData.getNumNulls(), + dateColumnStatsData.getNumDVs(), + ColumnStatsConverter.toBytes(dateColumnStatsData.bufferForBitVectors()), + ColumnStatsConverter.toBytes(dateColumnStatsData.bufferForHistogram())); + } + + private HiveSerializableDate fromThrift(Date date) { + return (date == null) ? null : new HiveSerializableDate(date.getDaysSinceEpoch()); + } + + public DateColumnStatsData toThrift(HiveDateStats dateStats) { + DateColumnStatsData dateColumnStatsData = new DateColumnStatsData(dateStats.numNulls(), dateStats.numDVs()); + if (dateStats.lowValue() != null) { + dateColumnStatsData.setLowValue(toThrift(dateStats.lowValue())); + } + if (dateStats.highValue() != null) { + dateColumnStatsData.setHighValue(toThrift(dateStats.highValue())); + } + dateColumnStatsData.setBitVectors(dateStats.bitVectors()); + dateColumnStatsData.setHistogram(dateStats.histogram()); + return dateColumnStatsData; + } + + private Date toThrift(HiveSerializableDate serializableDate) { + return new Date(serializableDate.daysSinceEpoch()); + } +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDateStats.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDateStats.java new file mode 100644 index 000000000000..2573af25d9d2 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDateStats.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; + +public record HiveDateStats( + HiveSerializableDate lowValue, + HiveSerializableDate highValue, + long numNulls, + long numDVs, + byte[] bitVectors, + byte[] histogram +) implements HiveColumnStatisticsData { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDecimalConverter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDecimalConverter.java new file mode 100644 index 000000000000..631c46c38d9b --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDecimalConverter.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.nio.ByteBuffer; +import org.apache.hadoop.hive.metastore.api.Decimal; +import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData; + +public class HiveDecimalConverter implements HiveTypeStatsConverter { + + public HiveDecimalStats fromThrift(DecimalColumnStatsData decimalColumnStatsData) { + return new HiveDecimalStats( + fromThrift(decimalColumnStatsData.getLowValue()), + fromThrift(decimalColumnStatsData.getHighValue()), + decimalColumnStatsData.getNumNulls(), + decimalColumnStatsData.getNumDVs(), + ColumnStatsConverter.toBytes(decimalColumnStatsData.bufferForBitVectors()), + ColumnStatsConverter.toBytes(decimalColumnStatsData.bufferForHistogram())); + } + + private HiveSerializableDecimal fromThrift(Decimal decimal) { + return (decimal == null) ? null : new HiveSerializableDecimal( + decimal.getScale(), ColumnStatsConverter.toBytes(decimal.bufferForUnscaled())); + } + + public DecimalColumnStatsData toThrift(HiveDecimalStats decimalStats) { + DecimalColumnStatsData decimalColumnStatsData = + new DecimalColumnStatsData(decimalStats.numNulls(), decimalStats.numDVs()); + + if (decimalStats.lowValue() != null) { + decimalColumnStatsData.setLowValue(toThrift(decimalStats.lowValue())); + } + if (decimalStats.highValue() != null) { + decimalColumnStatsData.setHighValue(toThrift(decimalStats.highValue())); + } + decimalColumnStatsData.setBitVectors(decimalStats.bitVectors()); + decimalColumnStatsData.setHistogram(decimalStats.histogram()); + + return decimalColumnStatsData; + } + + private Decimal toThrift(HiveSerializableDecimal serializableDecimal) { + return new Decimal(serializableDecimal.scale(), ByteBuffer.wrap(serializableDecimal.unscaled())); + } +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDecimalStats.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDecimalStats.java new file mode 100644 index 000000000000..93535a35acd3 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDecimalStats.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; + +public record HiveDecimalStats( + HiveSerializableDecimal lowValue, + HiveSerializableDecimal highValue, + long numNulls, + long numDVs, + byte[] bitVectors, + byte[] histogram +) implements HiveColumnStatisticsData { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDoubleConverter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDoubleConverter.java new file mode 100644 index 000000000000..02f843b38e43 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDoubleConverter.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; + +public class HiveDoubleConverter implements HiveTypeStatsConverter { + public HiveDoubleStats fromThrift(DoubleColumnStatsData doubleColumnStatsData) { + return new HiveDoubleStats( + doubleColumnStatsData.isSetLowValue() ? doubleColumnStatsData.getLowValue() : null, + doubleColumnStatsData.isSetHighValue() ? doubleColumnStatsData.getHighValue() : null, + doubleColumnStatsData.getNumNulls(), + doubleColumnStatsData.getNumDVs(), + ColumnStatsConverter.toBytes(doubleColumnStatsData.bufferForBitVectors()), + ColumnStatsConverter.toBytes(doubleColumnStatsData.bufferForHistogram())); + } + + public DoubleColumnStatsData toThrift(HiveDoubleStats doubleStats) { + DoubleColumnStatsData doubleColumnStatsData = + new DoubleColumnStatsData(doubleStats.numNulls(), doubleStats.numDVs()); + + if (doubleStats.lowValue() != null) { + doubleColumnStatsData.setLowValue(doubleStats.lowValue()); + } + if (doubleStats.highValue() != null) { + doubleColumnStatsData.setHighValue(doubleStats.highValue()); + } + doubleColumnStatsData.setBitVectors(doubleStats.bitVectors()); + doubleColumnStatsData.setHistogram(doubleStats.histogram()); + + return doubleColumnStatsData; + } +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDoubleStats.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDoubleStats.java new file mode 100644 index 000000000000..8d9d1b15137f --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveDoubleStats.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; + +public record HiveDoubleStats( + Double lowValue, + Double highValue, + long numNulls, + long numDVs, + byte[] bitVectors, + byte[] histogram +) implements HiveColumnStatisticsData { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveLongConverter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveLongConverter.java new file mode 100644 index 000000000000..4f4113f57e75 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveLongConverter.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; + +public class HiveLongConverter implements HiveTypeStatsConverter { + public HiveLongStats fromThrift(LongColumnStatsData longColumnStatsData) { + return new HiveLongStats( + longColumnStatsData.isSetLowValue() ? longColumnStatsData.getLowValue() : null, + longColumnStatsData.isSetHighValue() ? longColumnStatsData.getHighValue() : null, + longColumnStatsData.getNumNulls(), + longColumnStatsData.getNumDVs(), + ColumnStatsConverter.toBytes(longColumnStatsData.bufferForBitVectors()), + ColumnStatsConverter.toBytes(longColumnStatsData.bufferForHistogram())); + } + + public LongColumnStatsData toThrift(HiveLongStats longStats) { + LongColumnStatsData longColumnStatsData = new LongColumnStatsData(longStats.numNulls(), longStats.numDVs()); + if (longStats.lowValue() != null) { + longColumnStatsData.setLowValue(longStats.lowValue()); + } + if (longStats.highValue() != null) { + longColumnStatsData.setHighValue(longStats.highValue()); + } + longColumnStatsData.setBitVectors(longStats.bitVectors()); + longColumnStatsData.setHistogram(longStats.histogram()); + return longColumnStatsData; + } +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveLongStats.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveLongStats.java new file mode 100644 index 000000000000..42776ccd3e2c --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveLongStats.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; + +public record HiveLongStats( + Long lowValue, + Long highValue, + long numNulls, + long numDVs, + byte[] bitVectors, + byte[] histogram +) implements HiveColumnStatisticsData { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveSerializableDate.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveSerializableDate.java new file mode 100644 index 000000000000..1a4932e7e898 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveSerializableDate.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; +import java.io.Serializable; + +public record HiveSerializableDate(long daysSinceEpoch) implements Serializable { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveSerializableDecimal.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveSerializableDecimal.java new file mode 100644 index 000000000000..ee83922e07b0 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveSerializableDecimal.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; +import java.io.Serializable; + +public record HiveSerializableDecimal(short scale, byte[] unscaled) implements Serializable { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveSerializableTimestamp.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveSerializableTimestamp.java new file mode 100644 index 000000000000..2863f6a341b3 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveSerializableTimestamp.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; +import java.io.Serializable; + +public record HiveSerializableTimestamp(long secondsSinceEpoch) implements Serializable { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveStringConverter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveStringConverter.java new file mode 100644 index 000000000000..aa645b6831c5 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveStringConverter.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import org.apache.hadoop.hive.metastore.api.StringColumnStatsData; + +public class HiveStringConverter implements HiveTypeStatsConverter { + + public HiveStringStats fromThrift(StringColumnStatsData stringColumnStatsData) { + return new HiveStringStats( + stringColumnStatsData.getMaxColLen(), + stringColumnStatsData.getAvgColLen(), + stringColumnStatsData.getNumNulls(), + stringColumnStatsData.getNumDVs(), + ColumnStatsConverter.toBytes(stringColumnStatsData.bufferForBitVectors())); + } + + public StringColumnStatsData toThrift(HiveStringStats stringStats) { + StringColumnStatsData stringColumnStatsData = new StringColumnStatsData( + stringStats.maxColLen(), + stringStats.avgColLen(), + stringStats.numNulls(), + stringStats.numDVs()); + + stringColumnStatsData.setBitVectors(stringStats.bitVectors()); + + return stringColumnStatsData; + } +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveStringStats.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveStringStats.java new file mode 100644 index 000000000000..da2dd52d0560 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveStringStats.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; + +public record HiveStringStats( + long maxColLen, + double avgColLen, + long numNulls, + long numDVs, + byte[] bitVectors +) implements HiveColumnStatisticsData { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveTimestampConverter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveTimestampConverter.java new file mode 100644 index 000000000000..c8da030600ca --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveTimestampConverter.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import org.apache.hadoop.hive.metastore.api.Timestamp; +import org.apache.hadoop.hive.metastore.api.TimestampColumnStatsData; + +public class HiveTimestampConverter implements HiveTypeStatsConverter { + + public HiveTimestampStats fromThrift(TimestampColumnStatsData timestampColumnStatsData) { + return new HiveTimestampStats( + fromThrift(timestampColumnStatsData.getLowValue()), + fromThrift(timestampColumnStatsData.getHighValue()), + timestampColumnStatsData.getNumNulls(), + timestampColumnStatsData.getNumDVs(), + ColumnStatsConverter.toBytes(timestampColumnStatsData.bufferForBitVectors()), + ColumnStatsConverter.toBytes(timestampColumnStatsData.bufferForHistogram())); + } + + private HiveSerializableTimestamp fromThrift(Timestamp timestamp) { + return (timestamp == null) ? null : new HiveSerializableTimestamp(timestamp.getSecondsSinceEpoch()); + } + + public TimestampColumnStatsData toThrift(HiveTimestampStats timestampStats) { + TimestampColumnStatsData timestampColumnStatsData = + new TimestampColumnStatsData(timestampStats.numNulls(), timestampStats.numDVs()); + + if (timestampStats.lowValue() != null) { + timestampColumnStatsData.setLowValue(toThrift(timestampStats.lowValue())); + } + + if (timestampStats.highValue() != null) { + timestampColumnStatsData.setHighValue(toThrift(timestampStats.highValue())); + } + + timestampColumnStatsData.setBitVectors(timestampStats.bitVectors()); + timestampColumnStatsData.setHistogram(timestampStats.histogram()); + + return timestampColumnStatsData; + } + + private Timestamp toThrift(HiveSerializableTimestamp serializableTimestamp) { + return new Timestamp(serializableTimestamp.secondsSinceEpoch()); + } +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveTimestampStats.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveTimestampStats.java new file mode 100644 index 000000000000..2f490fdbe1f8 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveTimestampStats.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +import java.io.Serial; + +public record HiveTimestampStats( + HiveSerializableTimestamp lowValue, + HiveSerializableTimestamp highValue, + long numNulls, + long numDVs, + byte[] bitVectors, + byte[] histogram +) implements HiveColumnStatisticsData { + @Serial + private static final long serialVersionUID = 1L; +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveTypeStatsConverter.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveTypeStatsConverter.java new file mode 100644 index 000000000000..d581f96358ed --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveTypeStatsConverter.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.iceberg.mr.hive.stats; + +public interface HiveTypeStatsConverter { + R fromThrift(T thrift); + T toThrift(R record); +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 55f9d0c1e158..5c6ff48137b7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -239,22 +239,29 @@ public static long getNumRows(HiveConf conf, List schema, Table tabl return aggregateStat.getNumRows(); } - private static void estimateStatsForMissingCols(List neededColumns, List columnStats, + private static List estimateStatsForMissingCols( + List neededColumns, List existingColStats, HiveConf conf, long nr, List schema) { Set neededCols = new HashSet<>(neededColumns); Set colsWithStats = new HashSet<>(); + List neededColStats = new ArrayList<>(neededCols.size()); - for (ColStatistics cstats : columnStats) { - colsWithStats.add(cstats.getColumnName()); + for (ColStatistics colStatistics : existingColStats) { + colsWithStats.add(colStatistics.getColumnName()); + if (neededCols.contains(colStatistics.getColumnName())) { + neededColStats.add(colStatistics); + } } List missingColStats = new ArrayList<>(Sets.difference(neededCols, colsWithStats)); if (!missingColStats.isEmpty()) { - columnStats.addAll( + neededColStats.addAll( estimateStats(schema, missingColStats, conf, nr)); } + + return neededColStats; } public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, @@ -300,7 +307,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p if (needColStats && !metaTable) { colStats = getTableColumnStats(table, neededColumns, colStatsCache, fetchColStats); if (estimateStats) { - estimateStatsForMissingCols(neededColumns, colStats, conf, nr, schema); + colStats = estimateStatsForMissingCols(neededColumns, colStats, conf, nr, schema); } // we should have stats for all columns (estimated or actual) if (neededColumns.size() == colStats.size()) { @@ -386,7 +393,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p boolean statsRetrieved = aggrStats != null && aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0; if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) { - estimateStatsForMissingCols(neededColsToRetrieve, columnStats, conf, nr, schema); + columnStats = estimateStatsForMissingCols(neededColsToRetrieve, columnStats, conf, nr, schema); // There are some partitions with no state (or we didn't fetch any state). // Update the stats with empty list to reflect that in the // state/initialize structures.