package com.amazonaws.glue.catalog.util;

import com.amazonaws.glue.shims.ShimsLoader;

import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNot;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.log4j.Logger;

import java.util.LinkedList;
import java.util.List;
import java.util.Set;

/**
 * Utility methods for constructing the string representation of query expressions used by the Catalog service.
 */
public final class ExpressionHelper {

  // Name of Hive's string type.
  private static final String HIVE_STRING_TYPE_NAME = "string";
  // Operator tokens used to spell the Hive and the catalog form of a NOT IN expression.
  private static final String HIVE_IN_OPERATOR = "IN";
  private static final String HIVE_NOT_IN_OPERATOR = "NOT IN";
  private static final String HIVE_NOT_OPERATOR = "not";

  // TODO "hook" into Hive logging (hive or hive.metastore)
  private static final Logger logger = Logger.getLogger(ExpressionHelper.class);

  // Type names whose values are wrapped in single quotes when a predicate is built.
  private static final List<String> QUOTED_TYPES =
      ImmutableList.of("string", "char", "varchar", "date", "datetime", "timestamp");
  // Combines the individual predicates into one conjunction.
  private static final Joiner JOINER = Joiner.on(" AND ");

  * The method below is used to rewrite the hive expression tree to quote the timestamp values.
  * An example of this would be hive providing us as query as follows:
  * ((strCol = 'test') and (timestamp = 1969-12-31 16:02:03.456))
  * this will be rewritten by the method to:
  * ((strCol = 'test') and (timestamp = '1969-12-31 16:02:03.456'))
  * Notice the way the timestamp is quoted.
  * In order to perform this operation we recursively navigate the ExpressionTree
  * given to us by hive and switch the type to 'string' whenever we encounter a node type
  * of type 'timestamp'
  * When we call the getExprTree method of the modified expression tree, the timestamp values are
  * properly quoted.
  * This method also rewrites the expression string for "NOT IN" expression.
  * Hive converts the expression "<colName> NOT IN (<List of values>)"  to "(not (<colName>) IN (<List of values>))".
  * But in DataCatalog service, the parsing is done based on the original expression (which contains NOT IN).
  * So, we need to rewrite the expression if NOT IN was used.
  * */
  public static String convertHiveExpressionToCatalogExpression(byte[] exprBytes) throws MetaException {
    ExprNodeGenericFuncDesc exprTree = deserializeExpr(exprBytes);
    Set<String> columnNamesInNotInExpression = Sets.newHashSet();
    fieldEscaper(exprTree.getChildren(), exprTree, columnNamesInNotInExpression);
    String expression = rewriteExpressionForNotIn(exprTree.getExprString(), columnNamesInNotInExpression);
    return expression;

  private static ExprNodeGenericFuncDesc deserializeExpr(byte[] exprBytes) throws MetaException {
    ExprNodeGenericFuncDesc expr = null;
    try {
      expr = ShimsLoader.getHiveShims().getDeserializeExpression(exprBytes);
    } catch (Exception ex) {
      logger.error("Failed to deserialize the expression", ex);
      throw new MetaException(ex.getMessage());
    if (expr == null) {
      throw new MetaException("Failed to deserialize expression - ExprNodeDesc not present");
    return expr;

  //Helper method that recursively switches the type of the node, this is used
  //by the convertHiveExpressionToCatalogExpression
  private static void fieldEscaper(List<ExprNodeDesc> exprNodes, ExprNodeDesc parent, Set<String> columnNamesInNotInExpression) {
    if (exprNodes == null || exprNodes.isEmpty()) {
    } else {
      for (ExprNodeDesc nodeDesc : exprNodes) {
        String nodeType = nodeDesc.getTypeString().toLowerCase();
        if (QUOTED_TYPES.contains(nodeType)) {
          PrimitiveTypeInfo tInfo = new PrimitiveTypeInfo();
        addColumnNamesOfNotInExpressionToSet(nodeDesc, parent, columnNamesInNotInExpression);
        fieldEscaper(nodeDesc.getChildren(), nodeDesc, columnNamesInNotInExpression);

  /**
   * Method to extract the names of columns that are involved in a NOT IN expression. Only one column is allowed
   * to be used in a NOT IN expression. So, ExprNodeDesc.getCols() would return only 1 column.
   *
   * @param childNode the node being visited
   * @param parentNode the parent of childNode in the expression tree
   * @param columnsInNotInExpression the set that receives the column names
   */
  private static void addColumnNamesOfNotInExpressionToSet(ExprNodeDesc childNode, ExprNodeDesc parentNode, Set<String> columnsInNotInExpression) {
    if (parentNode != null && childNode != null && parentNode instanceof ExprNodeGenericFuncDesc && childNode instanceof ExprNodeGenericFuncDesc) {
      ExprNodeGenericFuncDesc parentFuncNode = (ExprNodeGenericFuncDesc) parentNode;
      ExprNodeGenericFuncDesc childFuncNode = (ExprNodeGenericFuncDesc) childNode;
      if(parentFuncNode.getGenericUDF() instanceof GenericUDFOPNot && childFuncNode.getGenericUDF() instanceof GenericUDFIn) {
        // The current parent child pair represents a "NOT IN" expression. Add name of the column to the set.

  private static String rewriteExpressionForNotIn(String expression, Set<String> columnsInNotInExpression){
    for (String columnName : columnsInNotInExpression) {
      if (columnName != null) {
        String hiveExpression = getHiveCompatibleNotInExpression(columnName);
        hiveExpression = escapeParentheses(hiveExpression);
        String catalogExpression = getCatalogCompatibleNotInExpression(columnName);
        catalogExpression = escapeParentheses(catalogExpression);
        expression = expression.replaceAll(hiveExpression, catalogExpression);
    return expression;

  // return "not (<columnName>) IN ("
  private static String getHiveCompatibleNotInExpression(String columnName) {
    return String.format("%s (%s) %s (", HIVE_NOT_OPERATOR, columnName, HIVE_IN_OPERATOR);

  // return "(<columnName>) NOT IN ("
  private static String getCatalogCompatibleNotInExpression(String columnName) {
    return String.format("(%s) %s (", columnName, HIVE_NOT_IN_OPERATOR);

  /**
   * Escape the parentheses so that they are considered literally and not as part of a regular expression. In the
   * updated expression, we need "\\(" as the output. So, the first four '\' generate '\\' and the last two '\'
   * generate a '('.
   */
  private static String escapeParentheses(String expression) {
    expression = expression.replaceAll("\\(", "\\\\\\(");
    expression = expression.replaceAll("\\)", "\\\\\\)");
    return expression;

  public static String buildExpressionFromPartialSpecification(org.apache.hadoop.hive.metastore.api.Table table,
          List<String> partitionValues) throws MetaException {

    List<org.apache.hadoop.hive.metastore.api.FieldSchema> partitionKeys = table.getPartitionKeys();

    if (partitionValues == null || partitionValues.isEmpty() ) {
      return null;

    if (partitionKeys == null || partitionValues.size() > partitionKeys.size()) {
      throw new MetaException("Incorrect number of partition values: " + partitionValues);

    partitionKeys = partitionKeys.subList(0, partitionValues.size());
    List<String> predicates = new LinkedList<>();
    for (int i = 0; i < partitionValues.size(); i++) {
      if (!Strings.isNullOrEmpty(partitionValues.get(i))) {
        predicates.add(buildPredicate(partitionKeys.get(i), partitionValues.get(i)));

    return JOINER.join(predicates);

  private static String buildPredicate(org.apache.hadoop.hive.metastore.api.FieldSchema schema, String value) {
    if (isQuotedType(schema.getType())) {
      return String.format("(%s='%s')", schema.getName(), escapeSingleQuotes(value));
    } else {
      return String.format("(%s=%s)", schema.getName(), value);

  private static String escapeSingleQuotes(String s) {
    return s.replaceAll("'", "\\\\'");

  private static boolean isQuotedType(String type) {
    return QUOTED_TYPES.contains(type);

  public static String replaceDoubleQuoteWithSingleQuotes(String s) {
    return s.replaceAll("\"", "\'");
