/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kitesdk.data.spi.hive;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import javax.annotation.Nullable;
import org.apache.avro.Schema;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.codehaus.jackson.node.NullNode;
import org.kitesdk.compat.DynConstructors;
import org.kitesdk.compat.DynMethods;
import org.kitesdk.data.PartitionStrategy;
import org.kitesdk.data.impl.Accessor;
import org.kitesdk.data.spi.FieldPartitioner;
import org.kitesdk.data.spi.SchemaUtil;

public class HiveSchemaConverter {

  // Handle for the static method named "getPrimitiveTypeInfo", looked up on
  // TypeInfoFactory with a single String parameter. It is invoked by
  // primitiveTypeInfo(String) to turn a Hive primitive type name into a TypeInfo.
  private static final DynMethods.StaticMethod primitiveTypeForName =
      new DynMethods.Builder("getPrimitiveTypeInfo")
          .impl(TypeInfoFactory.class, String.class)

  // Handle for the static method named "getTypeInfoFromTypeString", looked up
  // on TypeInfoUtils with a single String parameter. It is invoked by
  // parseTypeInfo(String) to parse a Hive type string into a TypeInfo.
  private static final DynMethods.StaticMethod parseTypeInfo =
      new DynMethods.Builder("getTypeInfoFromTypeString")
          .impl(TypeInfoUtils.class, String.class)

  /**
   * Looks up a TypeInfo implementation class through a DynConstructors builder.
   *
   * @param className name of the class to look up
   * @return the class found by the builder, or null when the lookup throws
   *         NoSuchMethodException
   */
  private static Class<?> findTypeInfoClass(String className) {
    try {
      // NOTE(review): the remainder of this builder chain is not visible in
      // this view; presumably it resolves className -- confirm
      return new DynConstructors.Builder(TypeInfo.class)
    } catch (NoSuchMethodException e) {
      // absence is reported to the caller as null rather than as an exception
      return null;

  // TypeInfo classes that may not be present at runtime. Each constant holds
  // the result of findTypeInfoClass, which is null when the lookup fails.
  // charClass and varcharClass are compared against type.getClass() when
  // converting primitive types.
  static final Class<?> charClass = findTypeInfoClass(
  static final Class<?> varcharClass = findTypeInfoClass(
  static final Class<?> decimalClass = findTypeInfoClass(

  // Maps a Hive primitive type name to the Avro schema type used to represent
  // it. Several Hive names share an Avro type: tinyint, smallint and int all
  // map to INT.
  private static final ImmutableMap<String, Schema.Type> TYPEINFO_TO_TYPE =
      ImmutableMap.<String, Schema.Type>builder()
          .put("boolean", Schema.Type.BOOLEAN)
          .put("tinyint", Schema.Type.INT)
          .put("smallint", Schema.Type.INT)
          .put("int", Schema.Type.INT)
          .put("bigint", Schema.Type.LONG)
          .put("float", Schema.Type.FLOAT)
          .put("double", Schema.Type.DOUBLE)
          .put("string", Schema.Type.STRING)
          .put("binary", Schema.Type.BYTES)
          .put("void", Schema.Type.NULL) // void columns are placeholders

  // Maps an Avro schema type to the Hive TypeInfo used to represent it. Each
  // value is obtained from primitiveTypeInfo(String). ENUM shares the "string"
  // mapping with STRING, and FIXED shares the "binary" mapping with BYTES.
  static final ImmutableMap<Schema.Type, TypeInfo> TYPE_TO_TYPEINFO =
      ImmutableMap.<Schema.Type, TypeInfo>builder()
          .put(Schema.Type.BOOLEAN, primitiveTypeInfo("boolean"))
          .put(Schema.Type.INT, primitiveTypeInfo("int"))
          .put(Schema.Type.LONG, primitiveTypeInfo("bigint"))
          .put(Schema.Type.FLOAT, primitiveTypeInfo("float"))
          .put(Schema.Type.DOUBLE, primitiveTypeInfo("double"))
          .put(Schema.Type.STRING, primitiveTypeInfo("string"))
          .put(Schema.Type.ENUM, primitiveTypeInfo("string"))
          .put(Schema.Type.BYTES, primitiveTypeInfo("binary"))
          .put(Schema.Type.FIXED, primitiveTypeInfo("binary"))

  // Avro null schema; placed first in the unions built by optional(Schema) and
  // by the UNION branch of convert.
  private static final Schema NULL = Schema.create(Schema.Type.NULL);
  // JSON null node used as the default value of fields that are optional.
  static final NullNode NULL_DEFAULT = NullNode.getInstance();
  // Empty collection of field paths, meaning that no field is required.
  static final Collection<String[]> NO_REQUIRED_FIELDS = ImmutableList.of();

  /**
   * Returns the TypeInfo for a Hive primitive type name by invoking the
   * primitiveTypeForName method handle.
   *
   * @param type Hive primitive type name, for example "bigint"
   * @return the value returned by the invoked method
   */
  private static TypeInfo primitiveTypeInfo(String type) {
    return primitiveTypeForName.invoke(type);

  /**
   * Parses a Hive type string into a TypeInfo by invoking the parseTypeInfo
   * method handle (the field of the same name).
   *
   * @param type Hive type string, such as the value of FieldSchema.getType()
   * @return the value returned by the invoked method
   */
  public static TypeInfo parseTypeInfo(String type) {
    return parseTypeInfo.invoke(type);

  /**
   * Converts the columns of a Hive table into Avro fields and creates an Avro
   * record schema named after the table.
   *
   * @param table name given to the created record schema
   * @param columns Hive columns; each column's type string is parsed with
   *                parseTypeInfo and converted with convertField
   * @param strategy partition strategy used to compute the required fields,
   *                 or null
   * @return the record schema created with Schema.createRecord
   */
  public static Schema convertTable(String table, Collection<FieldSchema> columns,
                                    @Nullable PartitionStrategy strategy) {
    // NOTE(review): fieldNames and fieldTypes are not used anywhere in the
    // visible code -- confirm whether they can be removed
    ArrayList<String> fieldNames = Lists.newArrayList();
    ArrayList<TypeInfo> fieldTypes = Lists.newArrayList();
    LinkedList<String> start = Lists.newLinkedList();
    Collection<String[]> requiredFields = requiredFields(strategy);

    List<Schema.Field> fields = Lists.newArrayList();
    for (FieldSchema column : columns) {
      // the initial path is the empty list, so the table name is not part of
      // any field path
      TypeInfo type = parseTypeInfo(column.getType());
      fields.add(convertField(start, column.getName(), type, requiredFields));

    // doc(TypeInfo) returns null for any StructTypeInfo, so the record is
    // created without a doc string
    StructTypeInfo struct = new StructTypeInfo();

    Schema recordSchema = Schema.createRecord(table, doc(struct), null, false);

    return recordSchema;

  /**
   * Converts a Hive struct type into an Avro record schema.
   *
   * @param path field path passed through to convertField for each struct field
   * @param name name given to the created record schema
   * @param type struct whose field names and field types are converted pairwise
   * @param required paths of required fields, passed through to convertField
   * @return the record schema created with Schema.createRecord
   * @throws IllegalArgumentException if the struct reports a different number
   *         of field names and field types
   */
  private static Schema convert(LinkedList<String> path, String name,
                                StructTypeInfo type,
                                Collection<String[]> required) {
    List<String> names = type.getAllStructFieldNames();
    List<TypeInfo> types = type.getAllStructFieldTypeInfos();
    // names and types are read by the same index below, so they must line up
    Preconditions.checkArgument(names.size() == types.size(),
        "Cannot convert struct: %s names != %s types",
        names.size(), types.size());

    List<Schema.Field> fields = Lists.newArrayList();
    for (int i = 0; i < names.size(); i += 1) {
      fields.add(convertField(path, names.get(i), types.get(i), required));

    // doc(type) is null for a StructTypeInfo, so no doc string is attached
    Schema recordSchema = Schema.createRecord(name, doc(type), null, false);

    return recordSchema;

  /**
   * Converts one named Hive type into an Avro field. The field's schema is
   * wrapped with optional(Schema) unless it is already a union or at least one
   * required-field path matches.
   *
   * @param path field path passed to filterByStartsWith and convert
   * @param name field name
   * @param type Hive type of the field
   * @param required paths of required fields
   * @return a field whose default is NULL_DEFAULT when its schema is a union
   *         and null otherwise
   */
  private static Schema.Field convertField(LinkedList<String> path, String name,
                                           TypeInfo type,
                                           Collection<String[]> required) {
    // filter the required fields with the current name
    Collection<String[]> matchingRequired = filterByStartsWith(required, path, name);

    Schema schema = convert(path, name, type, matchingRequired);
    // a union result is treated as already optional
    boolean isOptional = (schema.getType() == Schema.Type.UNION);

    if (!isOptional && matchingRequired.size() < 1) {
      // not already an optional union and not required, make it optional.
      // this doesn't complain if a required field is already optional because
      // the minimum required fields are validated by DatasetDescriptor.
      schema  = optional(schema);
      isOptional = true;

    // the field doc records the Hive type it was converted from; see doc(TypeInfo)
    return new Schema.Field(name, schema, doc(type),
        isOptional ? NULL_DEFAULT : null);

  /**
   * Converts a Hive TypeInfo into an Avro schema, dispatching on the type's
   * category.
   *
   * @param path field path, passed through to nested conversions
   * @param name name used for nested conversions; union branches append
   *             "_" and the branch index
   * @param type Hive type to convert
   * @param required paths of required fields, passed through to list, map and
   *                 struct conversions
   * @return the converted Avro schema
   * @throws IllegalArgumentException with "Unknown TypeInfo category" for a
   *         category that is not handled
   */
  static Schema convert(LinkedList<String> path, String name,
                        TypeInfo type, Collection<String[]> required) {
    switch (type.getCategory()) {
      case PRIMITIVE:
        // charClass and varcharClass may be null when those classes are not
        // available; getClass() never equals null, so the check is then false
        if (type.getClass() == charClass || type.getClass() == varcharClass) {
          // this is required because type name includes length
          return Schema.create(Schema.Type.STRING);

        String typeInfoName = type.getTypeName();
        // NOTE(review): the call owning the message argument below is not
        // visible in this view; presumably it rejects names that are missing
        // from TYPEINFO_TO_TYPE -- confirm
            "Cannot convert unsupported type: %s", typeInfoName);
        return Schema.create(TYPEINFO_TO_TYPE.get(typeInfoName));

      case LIST:
        // list elements are always wrapped with optional(Schema)
        return Schema.createArray(optional(convert(path, name,
            ((ListTypeInfo) type).getListElementTypeInfo(), required)));

      case MAP:
        MapTypeInfo mapType = (MapTypeInfo) type;
        // NOTE(review): the call owning the message argument below is not
        // visible in this view; presumably it checks the map key type -- confirm
            "Non-String map key type: %s", mapType.getMapKeyTypeInfo());

        // map values are always wrapped with optional(Schema)
        return Schema.createMap(optional(convert(path, name,
            mapType.getMapValueTypeInfo(), required)));

      case STRUCT:
        // delegates to the StructTypeInfo overload
        return convert(path, name, (StructTypeInfo) type, required);

      case UNION:
        List<TypeInfo> unionTypes = ((UnionTypeInfo) type)

        // add NULL so all union types are optional
        List<Schema> types = Lists.newArrayList(NULL);
        for (int i = 0; i < unionTypes.size(); i += 1) {
          // types within unions cannot be required
              path, name + "_" + i, unionTypes.get(i), NO_REQUIRED_FIELDS));

        return Schema.createUnion(types);

        throw new IllegalArgumentException(
            "Unknown TypeInfo category: " + type.getCategory());

  /**
   * Wraps a schema in a two-branch union whose first branch is the null schema.
   *
   * @param schema schema to make optional
   * @return a union of NULL followed by the given schema
   */
  static Schema optional(Schema schema) {
    return Schema.createUnion(Lists.newArrayList(NULL, schema));

  /**
   * Builds the doc string that records which Hive type a schema or field was
   * converted from.
   *
   * @param type Hive type being converted
   * @return null for a StructTypeInfo, otherwise "Converted from '" followed
   *         by the type's string form and a closing quote
   */
  private static String doc(TypeInfo type) {
    if (type instanceof StructTypeInfo) {
      // don't add struct<a:t1,b:t2> when fields a and b will have doc strings
      return null;
    return "Converted from '" + String.valueOf(type) + "'";

  /**
   * Converts an Avro schema into a list of Hive columns.
   *
   * @param avroSchema schema to convert
   * @return one column per field for a record schema; for any other schema, a
   *         single column named "column"
   */
  public static List<FieldSchema> convertSchema(Schema avroSchema) {
    List<FieldSchema> columns = Lists.newArrayList();
    if (Schema.Type.RECORD.equals(avroSchema.getType())) {
      // each record field becomes a column carrying the field's name, its
      // converted Hive type name, and the field's doc as the third argument
      for (Schema.Field field : avroSchema.getFields()) {
        columns.add(new FieldSchema(
            field.name(), convert(field.schema()).getTypeName(), field.doc()));
    } else {
      // a non-record schema becomes a single column with a fixed name
      columns.add(new FieldSchema(
          "column", convert(avroSchema).getTypeName(), avroSchema.getDoc()));
    return columns;

  /**
   * Converts an Avro schema into a Hive TypeInfo by visiting it with a new
   * Converter.
   *
   * @param schema Avro schema to convert
   * @return the result of SchemaUtil.visit for the schema
   */
  static TypeInfo convert(Schema schema) {
    return SchemaUtil.visit(schema, new Converter());

  /**
   * Schema visitor that produces the Hive TypeInfo for each kind of Avro
   * schema. It is used by convert(Schema).
   */
  private static class Converter extends SchemaUtil.SchemaVisitor<TypeInfo> {
    // a record becomes a Hive struct with the same field names and the
    // already-converted field types
    public TypeInfo record(Schema record, List<String> names, List<TypeInfo> types) {
      return TypeInfoFactory.getStructTypeInfo(names, types);

    // a union becomes a Hive union of its non-null options, or the single
    // remaining option when only one is left
    public TypeInfo union(Schema union, List<TypeInfo> options) {
      // there is no need to keep track of whether the avro type is nullable
      // because all Hive types are nullable
      List<TypeInfo> nonNullTypes = Lists.newArrayList();
      for (TypeInfo type : options) {
        // NOTE(review): the body of this check is not visible in this view;
        // presumably non-null options are collected in nonNullTypes -- confirm
        if (type != null) {

      // handle a single field in the union
      if (nonNullTypes.size() == 1) {
        return nonNullTypes.get(0);

      return TypeInfoFactory.getUnionTypeInfo(nonNullTypes);

    // an array becomes a Hive list of the converted element type
    public TypeInfo array(Schema array, TypeInfo element) {
      return TypeInfoFactory.getListTypeInfo(element);

    // a map becomes a Hive map keyed by the TypeInfo registered for STRING
    public TypeInfo map(Schema map, TypeInfo value) {
      return TypeInfoFactory.getMapTypeInfo(
          TYPE_TO_TYPEINFO.get(Schema.Type.STRING), value);

    // primitives are looked up in TYPE_TO_TYPEINFO; the lookup returns null
    // for a type without an entry in that map
    public TypeInfo primitive(Schema primitive) {
      return TYPE_TO_TYPEINFO.get(primitive.getType());

  /**
   * Selects field paths from the given collection by testing each one with
   * startsWith against the current path.
   *
   * @param fields candidate field paths
   * @param path current field path, used as the prefix in the startsWith test
   * @param name current field name
   * @return the list built by this method
   */
  private static Collection<String[]> filterByStartsWith(
      Collection<String[]> fields, LinkedList<String> path, String name) {

    List<String[]> startsWithCollection = Lists.newArrayList();
    for (String[] field : fields) {
      // NOTE(review): the body of this check is not visible in this view, and
      // name is not referenced in the visible lines; presumably matching
      // fields are added to startsWithCollection -- confirm
      if (startsWith(field, path)) {


    return startsWithCollection;

  /** Returns true if left starts with right, compared element by element. */
  // Prefix test: every element of right must equal, via String.equals, the
  // element of left at the same index. An empty right matches any left.
  private static boolean startsWith(String[] left, List<String> right) {
    // short circuit if a match isn't possible
    if (left.length < right.size()) {
      return false;

    // the length check above guarantees that left[i] is in bounds
    for (int i = 0; i < right.size(); i += 1) {
      if (!left[i].equals(right.get(i))) {
        return false;

    return true;

  /**
   * Computes the required field paths for a partition strategy by walking its
   * field partitioners.
   *
   * @param strategy partition strategy, or null
   * @return NO_REQUIRED_FIELDS when strategy is null, otherwise the list built
   *         from the strategy's field partitioners
   */
  private static Collection<String[]> requiredFields(@Nullable PartitionStrategy strategy) {
    if (strategy == null) {
      return NO_REQUIRED_FIELDS;

    List<String[]> requiredFields = Lists.newArrayList();
    for (FieldPartitioner fp : Accessor.getDefault().getFieldPartitioners(strategy)) {
      // source name is not present for provided partitioners
      // NOTE(review): the body of this check is not visible in this view;
      // presumably the source name is recorded in requiredFields -- confirm
      if (fp.getSourceName() != null) {

    return requiredFields;