diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 37e91652fb88..a29e532b113b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -2286,6 +2286,10 @@ public static List getColumnNamesFromFieldSchema(List partC return names; } + public static List getColumnTypesFromFieldSchema(List fieldSchemas) { + return fieldSchemas.stream().map(FieldSchema::getType).toList(); + } + public static List getInternalColumnNamesFromSignature(List colInfos) { List names = new ArrayList(); for (ColumnInfo ci : colInfos) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java index 986dcb7fcbbb..f616049d8591 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.parse; import java.io.IOException; +import java.io.Serializable; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Collection; @@ -1413,11 +1414,54 @@ public String toString() { } } + /** + * Holds table column {@link FieldSchema} entries and lazily derived parallel name/type string + * lists for analyze / column-stats compilation. + */ + public static final class FieldSchemas implements Serializable { + + private static final long serialVersionUID = 1L; + + private final List schemas; + + private transient List colNames; + private transient List colTypes; + + public FieldSchemas(List schemas) { + this.schemas = schemas != null ? schemas : Collections.emptyList(); + } + + public List getSchemas() { + return schemas; + } + + public int size() { + return schemas.size(); + } + + public FieldSchema get(int index) { + return schemas.get(index); + } + + public List getColName() { + if (colNames == null) { + colNames = Utilities.getColumnNamesFromFieldSchema(schemas); + } + return colNames; + } + + public List getColType() { + if (colTypes == null) { + colTypes = Utilities.getColumnTypesFromFieldSchema(schemas); + } + return colTypes; + } + } + public static class AnalyzeRewriteContext { private String tableName; - private List colName; - private List colType; + private FieldSchemas fieldSchemas; private boolean tblLvl; public String getTableName() { @@ -1428,12 +1472,12 @@ public void setTableName(String tableName) { this.tableName = tableName; } - public List getColName() { - return colName; + public FieldSchemas getFieldSchemas() { + return fieldSchemas; } - public void setColName(List colName) { - this.colName = colName; + public void setFieldSchemas(FieldSchemas fieldSchemas) { + this.fieldSchemas = fieldSchemas; } public boolean isTblLvl() { @@ -1444,14 +1488,6 @@ public void setTblLvl(boolean isTblLvl) { this.tblLvl = isTblLvl; } - public List getColType() { - return colType; - } - - public void setColType(List colType) { - this.colType = colType; - } - } /** diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java index ee80fc475299..e23e54aa5230 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java @@ -24,6 +24,7 @@ import com.google.common.base.Preconditions; import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; @@ -76,8 +77,7 @@ public class ColumnStatsSemanticAnalyzer extends SemanticAnalyzer { private boolean isRewritten; private boolean isTableLevel; - private List colNames; - private List colType; + private FieldSchemas rewrittenColumnSchemas; private Table tbl; public ColumnStatsSemanticAnalyzer(QueryState queryState) throws SemanticException { @@ -103,37 +103,36 @@ private boolean shouldRewrite(ASTNode tree) { } /** - * Get the names of the columns that support column statistics. + * Get the Field Schemas of the columns that support column statistics. */ - private static List getColumnNamesSupportingStats(Table tbl) { - List colNames = new ArrayList<>(); + private static FieldSchemas getStatsEligibleFieldSchemas(Table tbl) { + List result = new ArrayList<>(); for (FieldSchema col : tbl.getCols()) { String type = col.getType(); TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type); boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo); if (isSupported) { - colNames.add(col.getName()); + result.add(col); } } - return colNames; + return new FieldSchemas(result); } - private List getColumnName(ASTNode tree) throws SemanticException { - - switch (tree.getChildCount()) { - case 2: - return getColumnNamesSupportingStats(tbl); - case 3: - int numCols = tree.getChild(2).getChildCount(); - List colName = new ArrayList<>(numCols); - for (int i = 0; i < numCols; i++) { - colName.add(getUnescapedName((ASTNode) tree.getChild(2).getChild(i))); - } - return colName; - default: - throw new SemanticException("Internal error. Expected number of children of ASTNode to be" - + " either 2 or 3. Found : " + tree.getChildCount()); + private List getExplicitColumnNamesFromAst(ASTNode tree) throws SemanticException { + // The parser stores this statement as three pieces in order: which table (or partition) to + // analyze, a flag that this is column-level stats (not scanning the whole table for table + // stats alone), then the listed column names from "FOR COLUMNS (a, b, ...)". That layout is the reason + // we expect exactly three children and read the identifiers from the last one. + if (tree.getChildCount() != 3) { + throw new SemanticException("Internal error. Expected number of children of ASTNode should be 3. Found : " + + tree.getChildCount()); } + int numCols = tree.getChild(2).getChildCount(); + List colName = new ArrayList<>(numCols); + for (int i = 0; i < numCols; i++) { + colName.add(getUnescapedName((ASTNode) tree.getChild(2).getChild(i))); + } + return colName; } private void handlePartialPartitionSpec(Map partSpec, ColumnStatsAutoGatherContext context) throws @@ -217,34 +216,52 @@ private static String getColTypeOf(Table tbl, String partKey) { throw new RuntimeException("Unknown partition key : " + partKey); } - protected static List getColumnTypes(Table tbl, List colNames) { - List colTypes = new ArrayList<>(); - List cols = tbl.getCols(); - List copyColNames = new ArrayList<>(colNames); - - for (String colName : copyColNames) { - for (FieldSchema col : cols) { - if (colName.equalsIgnoreCase(col.getName())) { - String type = col.getType(); - TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type); - boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo); - if (!isSupported) { - logTypeWarning(colName, type); - colNames.remove(colName); - } else { - colTypes.add(type); - } - } + protected static List getFieldSchemasByColName(Table tbl, List colNames) + throws SemanticException { + Map specifiedColsMap = new HashMap<>(); + for (String colName : colNames) { + specifiedColsMap.put(colName.toLowerCase(), new FieldSchema(colName, null, null)); + } + + for (FieldSchema pk : tbl.getPartitionKeys()) { + FieldSchema fs = specifiedColsMap.get(pk.getName().toLowerCase()); + if (fs != null) { + throw new SemanticException(ErrorMsg.COLUMNSTATSCOLLECTOR_INVALID_COLUMN.getMsg() + + " [Try removing column '" + fs.getName() + "' from column list]"); } } - return colTypes; + for (FieldSchema col : tbl.getCols()) { + specifiedColsMap.computeIfPresent(col.getName().toLowerCase(), (key, value) -> col); + } + + List result = new ArrayList<>(); + List tableColNames = new FieldSchemas(tbl.getCols()).getColName(); + for (String colName : colNames) { + FieldSchema fs = specifiedColsMap.get(colName.toLowerCase()); + + // If the type is null, the column does not exist as its FieldSchema was not populated from tbl.getCols() + if (fs.getType() == null) { + String msg = "'" + colName + "' (possible columns are " + tableColNames + ")"; + throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(msg)); + } + + String type = fs.getType(); + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type); + boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo); + if (!isSupported) { + logTypeWarning(colName, type); + } else { + result.add(new FieldSchema(colName, type, fs.getComment())); + } + } + return result; } - private String genRewrittenQuery(List colNames, List colTypes, HiveConf conf, + private String genRewrittenQuery(FieldSchemas columnSchemas, HiveConf conf, List partTransformSpec, int specId, Map partSpec, boolean isPartitionStats) { - String rewritten = genRewrittenQuery(tbl, colNames, colTypes, conf, partTransformSpec, specId, partSpec, + String rewritten = genRewrittenQuery(tbl, columnSchemas, conf, partTransformSpec, specId, partSpec, isPartitionStats, false); isRewritten = true; return rewritten; @@ -257,28 +274,27 @@ private String genRewrittenQuery(List colNames, List colTypes, H protected static String genRewrittenQuery(Table tbl, HiveConf conf, List partTransformSpec, Map partSpec, boolean isPartitionStats) { - List colNames = getColumnNamesSupportingStats(tbl); - List colTypes = ColumnStatsSemanticAnalyzer.getColumnTypes(tbl, colNames); - return ColumnStatsSemanticAnalyzer.genRewrittenQuery( - tbl, colNames, colTypes, conf, partTransformSpec, -1, partSpec, isPartitionStats, true); + return ColumnStatsSemanticAnalyzer.genRewrittenQuery(tbl, getStatsEligibleFieldSchemas(tbl), conf, + partTransformSpec, -1, partSpec, isPartitionStats, true); } - private static String genRewrittenQuery(Table tbl, List colNames, List colTypes, + private static String genRewrittenQuery(Table tbl, FieldSchemas columnSchemas, HiveConf conf, List partTransformSpec, int specId, Map partSpec, boolean isPartitionStats, boolean useTableValues) { StringBuilder rewrittenQueryBuilder = new StringBuilder("select "); StringBuilder columnNamesBuilder = new StringBuilder(); StringBuilder columnDummyValuesBuilder = new StringBuilder(); - for (int i = 0; i < colNames.size(); i++) { + for (int i = 0; i < columnSchemas.size(); i++) { if (i > 0) { rewrittenQueryBuilder.append(", "); columnNamesBuilder.append(", "); columnDummyValuesBuilder.append(", "); } - final String columnName = unparseIdentifier(colNames.get(i), conf); - final TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(colTypes.get(i)); + FieldSchema columnSchema = columnSchemas.get(i); + final String columnName = unparseIdentifier(columnSchema.getName(), conf); + final TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(columnSchema.getType()); try { genComputeStats(rewrittenQueryBuilder, conf, i, columnName, typeInfo); @@ -586,31 +602,6 @@ private ASTNode genRewrittenTree(String rewrittenQuery) throws SemanticException } } - // fail early if the columns specified for column statistics are not valid - private void validateSpecifiedColumnNames(List specifiedCols) - throws SemanticException { - List tableCols = Utilities.getColumnNamesFromFieldSchema(tbl.getCols()); - for (String sc : specifiedCols) { - if (!tableCols.contains(sc.toLowerCase())) { - String msg = "'" + sc + "' (possible columns are " + tableCols + ")"; - throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(msg)); - } - } - } - - private void checkForPartitionColumns(List specifiedCols, List partCols) - throws SemanticException { - // Raise error if user has specified partition column for stats - for (String pc : partCols) { - for (String sc : specifiedCols) { - if (pc.equalsIgnoreCase(sc)) { - throw new SemanticException(ErrorMsg.COLUMNSTATSCOLLECTOR_INVALID_COLUMN.getMsg() - + " [Try removing column '" + sc + "' from column list]"); - } - } - } - } - private static void logTypeWarning(String colName, String colType) { String warning = "Only primitive type arguments are accepted but " + colType + " is passed for " + colName + "."; @@ -634,8 +625,6 @@ public void analyze(ASTNode ast, Context origCtx) throws SemanticException { */ if (shouldRewrite(ast)) { tbl = AnalyzeCommandUtils.getTable(ast, this); - colNames = getColumnName(ast); - // Save away the original AST originalTree = ast; boolean isPartitionStats = AnalyzeCommandUtils.isPartitionLevelStats(ast) || StatsUtils.isPartitionStats(tbl, conf); @@ -643,9 +632,8 @@ public void analyze(ASTNode ast, Context origCtx) throws SemanticException { Map> partTransformSpecs = Collections.singletonMap(-1, null); Map partSpec = (isPartitionStats) ? AnalyzeCommandUtils.getPartKeyValuePairsFromAST(tbl, ast, conf) : null; - checkForPartitionColumns( - colNames, Utilities.getColumnNamesFromFieldSchema(tbl.getPartitionKeys())); - validateSpecifiedColumnNames(colNames); + + List columnSchemas = getColumnsFromAst(ast); if (isPartitionStats) { handlePartialPartitionSpec(partSpec, null); @@ -653,12 +641,12 @@ public void analyze(ASTNode ast, Context origCtx) throws SemanticException { partTransformSpecs = tbl.getStorageHandler().getPartitionTransformSpecs(tbl); } } - colType = getColumnTypes(tbl, colNames); + rewrittenColumnSchemas = new FieldSchemas(columnSchemas); isTableLevel = !isPartitionStats; rewrittenQuery = String.join(" union all ", Maps.transformEntries(partTransformSpecs, (specId, partTransformSpec) -> - genRewrittenQuery(colNames, colType, conf, partTransformSpec, specId, partSpec, isPartitionStats)) + genRewrittenQuery(rewrittenColumnSchemas, conf, partTransformSpec, specId, partSpec, isPartitionStats)) .values()); rewrittenTree = genRewrittenTree(rewrittenQuery); @@ -677,8 +665,7 @@ public void analyze(ASTNode ast, Context origCtx) throws SemanticException { analyzeRewrite = new AnalyzeRewriteContext(); analyzeRewrite.setTableName(tbl.getFullyQualifiedName()); analyzeRewrite.setTblLvl(isTableLevel); - analyzeRewrite.setColName(colNames); - analyzeRewrite.setColType(colType); + analyzeRewrite.setFieldSchemas(rewrittenColumnSchemas); qbp.setAnalyzeRewrite(analyzeRewrite); origCtx.addSubContext(ctx); initCtx(ctx); @@ -709,15 +696,13 @@ public ASTNode rewriteAST(ASTNode ast, ColumnStatsAutoGatherContext context) tbl = AnalyzeCommandUtils.getTable(ast, this); - colNames = getColumnName(ast); boolean isPartitionStats = AnalyzeCommandUtils.isPartitionLevelStats(ast) || StatsUtils.isPartitionStats(tbl, conf); List partTransformSpec = null; Map partSpec = null; - checkForPartitionColumns(colNames, - Utilities.getColumnNamesFromFieldSchema(tbl.getPartitionKeys())); - validateSpecifiedColumnNames(colNames); + + List columnSchemas = getColumnsFromAst(ast); if (isPartitionStats) { partSpec = AnalyzeCommandUtils.getPartKeyValuePairsFromAST(tbl, ast, conf); @@ -726,22 +711,35 @@ public ASTNode rewriteAST(ASTNode ast, ColumnStatsAutoGatherContext context) partTransformSpec = tbl.getStorageHandler().getPartitionTransformSpec(tbl); } } - colType = getColumnTypes(tbl, colNames); + rewrittenColumnSchemas = new FieldSchemas(columnSchemas); isTableLevel = !isPartitionStats; - rewrittenQuery = genRewrittenQuery(colNames, colType, conf, partTransformSpec, -1, + rewrittenQuery = genRewrittenQuery(rewrittenColumnSchemas, conf, partTransformSpec, -1, partSpec, isPartitionStats); rewrittenTree = genRewrittenTree(rewrittenQuery); return rewrittenTree; } + protected List getColumnsFromAst(ASTNode ast) throws SemanticException { + List statsEligibleFS = null; + List columnNames; + if (ast.getChildCount() == 2) { + FieldSchemas eligibleFS = getStatsEligibleFieldSchemas(tbl); + statsEligibleFS = eligibleFS.getSchemas(); + columnNames = eligibleFS.getColName(); + } else { + columnNames = getExplicitColumnNamesFromAst(ast); + } + + return statsEligibleFS != null ? statsEligibleFS : getFieldSchemasByColName(tbl, columnNames); + } + AnalyzeRewriteContext getAnalyzeRewriteContext() { AnalyzeRewriteContext analyzeRewrite = new AnalyzeRewriteContext(); analyzeRewrite.setTableName(tbl.getFullyQualifiedName()); analyzeRewrite.setTblLvl(isTableLevel); - analyzeRewrite.setColName(colNames); - analyzeRewrite.setColType(colType); + analyzeRewrite.setFieldSchemas(rewrittenColumnSchemas); return analyzeRewrite; } @@ -749,10 +747,7 @@ static AnalyzeRewriteContext genAnalyzeRewriteContext(HiveConf conf, Table tbl) AnalyzeRewriteContext analyzeRewrite = new AnalyzeRewriteContext(); analyzeRewrite.setTableName(tbl.getFullyQualifiedName()); analyzeRewrite.setTblLvl(!(conf.getBoolVar(ConfVars.HIVE_STATS_COLLECT_PART_LEVEL_STATS) && tbl.isPartitioned())); - List colNames = getColumnNamesSupportingStats(tbl); - List colTypes = getColumnTypes(tbl, colNames); - analyzeRewrite.setColName(colNames); - analyzeRewrite.setColType(colTypes); + analyzeRewrite.setFieldSchemas(getStatsEligibleFieldSchemas(tbl)); return analyzeRewrite; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java index 11dda4ef638e..d661102a104c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java @@ -68,6 +68,7 @@ import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec; import org.apache.hadoop.hive.ql.plan.BasicStatsWork; import org.apache.hadoop.hive.ql.plan.ColumnStatsDesc; +import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.FieldSchemas; import org.apache.hadoop.hive.ql.plan.FetchWork; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.LoadFileDesc; @@ -672,8 +673,7 @@ protected void genColumnStatsTask(AnalyzeRewriteContext analyzeRewrite, int outerQueryLimit, int numBitVector) throws SemanticException { FetchWork fetch; String tableName = analyzeRewrite.getTableName(); - List colName = analyzeRewrite.getColName(); - List colType = analyzeRewrite.getColType(); + FieldSchemas fieldSchemas = analyzeRewrite.getFieldSchemas(); boolean isTblLevel = analyzeRewrite.isTblLvl(); String cols = loadFileWork.get(0).getColumns(); @@ -691,7 +691,7 @@ protected void genColumnStatsTask(AnalyzeRewriteContext analyzeRewrite, fetch = new FetchWork(loadFileWork.get(0).getSourcePath(), resultTab, outerQueryLimit); ColumnStatsDesc cStatsDesc = new ColumnStatsDesc(tableName, - colName, colType, isTblLevel, numBitVector, fetch); + fieldSchemas, isTblLevel, numBitVector, fetch); StatsTask columnStatsTask = map.get(tableName); if (columnStatsTask == null) { throw new SemanticException("Can not find " + tableName + " in genColumnStatsTask"); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsDesc.java index 9a90aa2633a6..b5993f395014 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColumnStatsDesc.java @@ -20,6 +20,7 @@ import java.io.Serializable; import java.util.List; +import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.FieldSchemas; import org.apache.hadoop.hive.ql.plan.Explain.Level; /** @@ -34,15 +35,12 @@ public class ColumnStatsDesc implements Serializable, Cloneable { private int numBitVector; private boolean needMerge; private String tableName; - private List colName; - private List colType; + private FieldSchemas columnSchemas; - - public ColumnStatsDesc(String tableName, List colName, - List colType, boolean isTblLevel, int numBitVector, FetchWork fWork1) { + public ColumnStatsDesc(String tableName, FieldSchemas columnSchemas, boolean isTblLevel, + int numBitVector, FetchWork fWork1) { this.tableName = tableName; - this.colName = colName; - this.colType = colType; + this.columnSchemas = columnSchemas; this.isTblLevel = isTblLevel; this.numBitVector = numBitVector; this.needMerge = this.numBitVector != 0; @@ -69,20 +67,12 @@ public void setTblLevel(boolean isTblLevel) { @Explain(displayName = "Columns") public List getColName() { - return colName; - } - - public void setColName(List colName) { - this.colName = colName; + return columnSchemas == null ? null : columnSchemas.getColName(); } @Explain(displayName = "Column Types") public List getColType() { - return colType; - } - - public void setColType(List colType) { - this.colType = colType; + return columnSchemas == null ? null : columnSchemas.getColType(); } public int getNumBitVector() {