Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,9 @@
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.mr.hive.actions.HiveIcebergDeleteOrphanFiles;
import org.apache.iceberg.mr.hive.plan.IcebergBucketFunction;
import org.apache.iceberg.mr.hive.stats.ColumnStatsConverter;
import org.apache.iceberg.mr.hive.stats.HiveColumnStatistics;
import org.apache.iceberg.mr.hive.stats.HiveColumnStatisticsObj;
import org.apache.iceberg.mr.hive.udf.GenericUDFIcebergZorder;
import org.apache.iceberg.puffin.Blob;
import org.apache.iceberg.puffin.BlobMetadata;
Expand Down Expand Up @@ -644,8 +647,12 @@ private boolean writeColStats(List<ColumnStatistics> colStats, Table tbl) {
Map<String, String> properties = isTblLevel ? Map.of() :
Map.of(PARTITION, String.valueOf(stats.getStatsDesc().getPartName()));

List<? extends Serializable> statsObjects = isTblLevel ?
stats.getStatsObj() : List.of(stats);
List<? extends Serializable> statsObjects;
if (isTblLevel) {
statsObjects = stats.getStatsObj().stream().map(ColumnStatsConverter::fromThrift).toList();
} else {
statsObjects = List.of(ColumnStatsConverter.fromThrift(stats));
}

List<Integer> fieldIds = null;

Expand All @@ -664,11 +671,11 @@ private boolean writeColStats(List<ColumnStatistics> colStats, Table tbl) {

if (isTblLevel) {
fieldIds = List.of(schema.findField(
((ColumnStatisticsObj) statsObj).getColName()).fieldId());
((HiveColumnStatisticsObj) statsObj).colName()).fieldId());
}

writer.add(new Blob(
ColumnStatisticsObj.class.getSimpleName(),
HiveColumnStatisticsObj.class.getSimpleName(),
fieldIds,
snapshotId,
snapshotSequenceNumber,
Expand Down Expand Up @@ -753,7 +760,9 @@ public List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.meta
filter = null;
}

return IcebergTableUtil.readColStats(table, snapshot.snapshotId(), filter);
List<HiveColumnStatisticsObj> columnStatisticsObjList =
IcebergTableUtil.readColStats(table, snapshot.snapshotId(), filter);
return columnStatisticsObjList.stream().map(ColumnStatsConverter::toThrift).toList();
}

@Override
Expand All @@ -773,7 +782,9 @@ public AggrStats getAggrColStatsFor(org.apache.hadoop.hive.ql.metadata.Table hms
Set<String> partitions = Sets.newHashSet(partNames);
Predicate<BlobMetadata> filter = metadata -> partitions.contains(metadata.properties().get(PARTITION));

List<ColumnStatistics> partStats = IcebergTableUtil.readColStats(table, snapshot.snapshotId(), filter);
List<HiveColumnStatistics> storedStats =
IcebergTableUtil.readColStats(table, snapshot.snapshotId(), filter);
List<ColumnStatistics> partStats = storedStats.stream().map(ColumnStatsConverter::toThrift).toList();

partStats.forEach(colStats ->
colStats.getStatsObj().removeIf(statsObj -> !colNames.contains(statsObj.getColName())));
Expand Down Expand Up @@ -831,13 +842,17 @@ private void checkAndMergeColStats(List<ColumnStatistics> statsNew, Table tbl) t
boolean isTblLevel = statsNew.getFirst().getStatsDesc().isIsTblLevel();
Map<String, ColumnStatistics> oldStatsMap = Maps.newHashMap();

List<?> statsOld = IcebergTableUtil.readColStats(tbl, previousSnapshotId, null);

List<?> statsOld;
if (!isTblLevel) {
List<HiveColumnStatistics> storedStats = IcebergTableUtil.readColStats(tbl, previousSnapshotId, null);
statsOld = storedStats.stream().map(ColumnStatsConverter::toThrift).toList();

for (ColumnStatistics statsObjOld : (List<ColumnStatistics>) statsOld) {
oldStatsMap.put(statsObjOld.getStatsDesc().getPartName(), statsObjOld);
}
} else {
List<HiveColumnStatisticsObj> storedStatObjects = IcebergTableUtil.readColStats(tbl, previousSnapshotId, null);
statsOld = storedStatObjects.stream().map(ColumnStatsConverter::toThrift).toList();
statsOld = Collections.singletonList(
new ColumnStatistics(null, (List<ColumnStatisticsObj>) statsOld));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.metastore.utils.TableFetcher;
Expand Down Expand Up @@ -101,6 +100,7 @@
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.mr.hive.stats.HiveColumnStatisticsObj;
import org.apache.iceberg.puffin.BlobMetadata;
import org.apache.iceberg.puffin.Puffin;
import org.apache.iceberg.puffin.PuffinReader;
Expand Down Expand Up @@ -256,7 +256,7 @@ static String getColStatsPath(Table table, long snapshotId) {
return table.statisticsFiles().stream()
.filter(stats -> stats.snapshotId() == snapshotId)
.filter(stats -> stats.blobMetadata().stream()
.anyMatch(metadata -> ColumnStatisticsObj.class.getSimpleName().equals(metadata.type()))
.anyMatch(metadata -> HiveColumnStatisticsObj.class.getSimpleName().equals(metadata.type()))
)
.map(StatisticsFile::path)
.findAny().orElse(null);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.iceberg.mr.hive.stats;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.stream.Collectors;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;

public final class ColumnStatsConverter {
private static final HiveBooleanConverter BOOLEAN = new HiveBooleanConverter();
private static final HiveLongConverter LONG = new HiveLongConverter();
private static final HiveDoubleConverter DOUBLE = new HiveDoubleConverter();
private static final HiveStringConverter STRING = new HiveStringConverter();
private static final HiveBinaryConverter BINARY = new HiveBinaryConverter();
private static final HiveDecimalConverter DECIMAL = new HiveDecimalConverter();
private static final HiveDateConverter DATE = new HiveDateConverter();
private static final HiveTimestampConverter TIMESTAMP = new HiveTimestampConverter();

private ColumnStatsConverter() {

}

public static HiveColumnStatistics fromThrift(ColumnStatistics columnStatistics) {
if (columnStatistics == null) {
return null;
}

ColumnStatisticsDesc tDesc = columnStatistics.getStatsDesc();
return new HiveColumnStatistics(
new HiveColumnStatisticsDesc(tDesc.isIsTblLevel(), tDesc.getDbName(), tDesc.getTableName(),
tDesc.getPartName(), tDesc.isSetLastAnalyzed() ? tDesc.getLastAnalyzed() : null, tDesc.getCatName()),
columnStatistics.getStatsObj().stream().map(ColumnStatsConverter::fromThrift).collect(Collectors.toList()),

Check warning on line 51 in iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/ColumnStatsConverter.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Replace this usage of 'Stream.collect(Collectors.toList())' with 'Stream.toList()' and ensure that the list is unmodified.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ4r127qPVTzyL349e-p&open=AZ4r127qPVTzyL349e-p&pullRequest=6491
columnStatistics.isSetIsStatsCompliant() ? columnStatistics.isIsStatsCompliant() : null,
columnStatistics.getEngine()
);
}

public static HiveColumnStatisticsObj fromThrift(ColumnStatisticsObj columnStatisticsObj) {
return new HiveColumnStatisticsObj(
columnStatisticsObj.getColName(),
columnStatisticsObj.getColType(),
fromThriftData(columnStatisticsObj.getStatsData()));
}

private static HiveColumnStatisticsData fromThriftData(ColumnStatisticsData columnStatisticsData) {
if (columnStatisticsData.isSetBooleanStats()) {
return BOOLEAN.fromThrift(columnStatisticsData.getBooleanStats());
}

if (columnStatisticsData.isSetLongStats()) {
return LONG.fromThrift(columnStatisticsData.getLongStats());
}

if (columnStatisticsData.isSetDoubleStats()) {
return DOUBLE.fromThrift(columnStatisticsData.getDoubleStats());
}

if (columnStatisticsData.isSetStringStats()) {
return STRING.fromThrift(columnStatisticsData.getStringStats());
}

if (columnStatisticsData.isSetBinaryStats()) {
return BINARY.fromThrift(columnStatisticsData.getBinaryStats());
}

if (columnStatisticsData.isSetDecimalStats()) {
return DECIMAL.fromThrift(columnStatisticsData.getDecimalStats());
}

if (columnStatisticsData.isSetDateStats()) {
return DATE.fromThrift(columnStatisticsData.getDateStats());
}

if (columnStatisticsData.isSetTimestampStats()) {
return TIMESTAMP.fromThrift(columnStatisticsData.getTimestampStats());
}

throw new UnsupportedOperationException("Unsupported Hive Stat Type");
}

public static ColumnStatistics toThrift(HiveColumnStatistics record) {

Check warning on line 100 in iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/ColumnStatsConverter.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Rename this variable to not match a restricted identifier.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ4r127qPVTzyL349e-q&open=AZ4r127qPVTzyL349e-q&pullRequest=6491
if (record == null) {
return null;
}

ColumnStatistics result = new ColumnStatistics();

if (record.statsDesc() != null) {
result.setStatsDesc(toThrift(record.statsDesc()));
}

if (record.statsObj() != null) {
result.setStatsObj(record.statsObj().stream()
.map(ColumnStatsConverter::toThrift)
.collect(java.util.stream.Collectors.toList()));

Check warning on line 114 in iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/ColumnStatsConverter.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Replace this usage of 'Stream.collect(Collectors.toList())' with 'Stream.toList()' and ensure that the list is unmodified.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ4r127qPVTzyL349e-r&open=AZ4r127qPVTzyL349e-r&pullRequest=6491
}

if (record.isStatsCompliant() != null) {
result.setIsStatsCompliant(record.isStatsCompliant());
}

result.setEngine(record.engine());

return result;
}

private static ColumnStatisticsDesc toThrift(HiveColumnStatisticsDesc columnStatisticsDesc) {
var result = new ColumnStatisticsDesc(
columnStatisticsDesc.isTblLevel(),
columnStatisticsDesc.dbName(),
columnStatisticsDesc.tableName());

if (columnStatisticsDesc.partName() != null) {
result.setPartName(columnStatisticsDesc.partName());
}
if (columnStatisticsDesc.lastAnalyzed() != null) {
result.setLastAnalyzed(columnStatisticsDesc.lastAnalyzed());
}
if (columnStatisticsDesc.catName() != null) {
result.setCatName(columnStatisticsDesc.catName());
}

return result;
}

public static ColumnStatisticsObj toThrift(HiveColumnStatisticsObj columnStatisticsObj) {
return new ColumnStatisticsObj(
columnStatisticsObj.colName(),
columnStatisticsObj.colType(),
toThrift(columnStatisticsObj.statsData())
);
}

private static ColumnStatisticsData toThrift(HiveColumnStatisticsData columnStatisticsData) {
ColumnStatisticsData result = new ColumnStatisticsData();
switch (columnStatisticsData) {
case HiveBooleanStats statsValue -> result.setBooleanStats(BOOLEAN.toThrift(statsValue));
case HiveLongStats statsValue -> result.setLongStats(LONG.toThrift(statsValue));
case HiveDoubleStats statsValue -> result.setDoubleStats(DOUBLE.toThrift(statsValue));
case HiveStringStats statsValue -> result.setStringStats(STRING.toThrift(statsValue));
case HiveBinaryStats statsValue -> result.setBinaryStats(BINARY.toThrift(statsValue));
case HiveDecimalStats statsValue -> result.setDecimalStats(DECIMAL.toThrift(statsValue));
case HiveDateStats statsValue -> result.setDateStats(DATE.toThrift(statsValue));
case HiveTimestampStats statsValue -> result.setTimestampStats(TIMESTAMP.toThrift(statsValue));
}
return result;
}

public static byte[] toBytes(ByteBuffer byteBuffer) {
if (byteBuffer == null) {
return null;

Check warning on line 170 in iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/ColumnStatsConverter.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Return an empty array instead of null.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ4r127qPVTzyL349e-s&open=AZ4r127qPVTzyL349e-s&pullRequest=6491
}

byte[] arr = new byte[byteBuffer.remaining()];
byteBuffer.duplicate().get(arr);
return arr;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.iceberg.mr.hive.stats;

import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData;

/**
 * Converts binary column statistics between the Thrift
 * {@link BinaryColumnStatsData} model and the internal
 * {@code HiveBinaryStats} record.
 */
public class HiveBinaryConverter implements HiveTypeStatsConverter<BinaryColumnStatsData, HiveBinaryStats> {

  public HiveBinaryStats fromThrift(BinaryColumnStatsData thriftStats) {
    // Snapshot the bit vectors out of the Thrift buffer before building the record.
    byte[] bitVectors = ColumnStatsConverter.toBytes(thriftStats.bufferForBitVectors());
    return new HiveBinaryStats(
        thriftStats.getMaxColLen(),
        thriftStats.getAvgColLen(),
        thriftStats.getNumNulls(),
        bitVectors);
  }

  public BinaryColumnStatsData toThrift(HiveBinaryStats stats) {
    BinaryColumnStatsData result =
        new BinaryColumnStatsData(stats.maxColLen(), stats.avgColLen(), stats.numNulls());
    result.setBitVectors(stats.bitVectors());
    return result;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.iceberg.mr.hive.stats;

import java.io.Serial;
import java.util.Arrays;
import java.util.Objects;

public record HiveBinaryStats(

Check warning on line 23 in iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/stats/HiveBinaryStats.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Override equals, hashCode and toString to consider array's content in the method

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZ4r1278PVTzyL349e-u&open=AZ4r1278PVTzyL349e-u&pullRequest=6491
long maxColLen,
double avgColLen,
long numNulls,
byte[] bitVectors
) implements HiveColumnStatisticsData {
@Serial
private static final long serialVersionUID = 1L;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.iceberg.mr.hive.stats;

import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData;

/**
 * Converts boolean column statistics between the Thrift
 * {@link BooleanColumnStatsData} model and the internal
 * {@code HiveBooleanStats} record.
 */
public class HiveBooleanConverter implements HiveTypeStatsConverter<BooleanColumnStatsData, HiveBooleanStats> {

  public HiveBooleanStats fromThrift(BooleanColumnStatsData thriftStats) {
    // Snapshot the bit vectors out of the Thrift buffer before building the record.
    byte[] bitVectors = ColumnStatsConverter.toBytes(thriftStats.bufferForBitVectors());
    return new HiveBooleanStats(
        thriftStats.getNumTrues(),
        thriftStats.getNumFalses(),
        thriftStats.getNumNulls(),
        bitVectors);
  }

  public BooleanColumnStatsData toThrift(HiveBooleanStats stats) {
    BooleanColumnStatsData result =
        new BooleanColumnStatsData(stats.numTrues(), stats.numFalses(), stats.numNulls());
    result.setBitVectors(stats.bitVectors());
    return result;
  }
}
Loading
Loading