java source code of BaseJsonSerDe

package com.esri.hadoop.hive.serde;

import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.TimeZone;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.lazy.LazyPrimitive;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.JsonToken;

import com.esri.core.geometry.ogc.OGCGeometry;
import com.esri.hadoop.hive.GeometryUtils;
import com.esri.hadoop.shims.HiveShims;


abstract public class BaseJsonSerDe extends AbstractSerDe {
	/** Logger for this class; declared without an access modifier, so it is package-visible. */
	static final Log LOG = LogFactory.getLog(BaseJsonSerDe.class.getName());

	/** Factory from which the JSON parsers and generators used by this SerDe are created. */
	protected static JsonFactory jsonFactory = new JsonFactory();

	/** The JVM's default time zone, captured once when this class is loaded. */
	protected static TimeZone tz = TimeZone.getDefault();

	/** Number of columns in the table, as determined by {@code initialize}. */
	protected int numColumns;

	/** Index of the binary column that carries the geometry, or -1 when there is none. */
	protected int geometryColumn = -1;

	/** Column names, lower-cased, in table order. */
	protected ArrayList<String> columnNames;

	/** One object inspector per column, in the same order as {@code columnNames}. */
	protected ArrayList<ObjectInspector> columnOIs;

	/**
	 * One flag per column.  It is allocated in {@code initialize} but not read anywhere
	 * in this class - presumably kept for subclasses (verify before removing).
	 */
	protected boolean[] columnSet;

	/** Struct inspector carrying the type information for the fields returned. */
	protected StructObjectInspector rowOI;

	/**
	 * Name of the JSON member that holds the attribute values.  The default is
	 * "attributes"; the original note mentions "properties" as the alternative label.
	 */
	protected String attrLabel = "attributes";

	/*
	 * Writable reuse: rowBase holds one Writable per column, created once and reused
	 * for every record.  Each call to deserialize starts by setting every entry of
	 * row to null; for every attribute found in the JSON record the Writable is then
	 * taken from rowBase, given the parsed value and referenced from row, which is
	 * what gets returned.  This is why values do not linger from previous records.
	 */
	ArrayList<Writable> rowBase;
	ArrayList<Writable> row;
	
	/**
	 * Prepares this SerDe for a table: reads the column names and column types from
	 * the table properties, builds one object inspector per column plus the struct
	 * inspector for the row, and allocates the reusable Writables.
	 *
	 * <p>Only primitive column types are accepted, and at most one column may be of
	 * type {@code binary}; that column is treated as the geometry column.
	 *
	 * @param cfg job configuration (not consulted here)
	 * @param tbl table properties; must define the column-name and column-type lists
	 * @throws SerDeException if either property is missing, if the two lists differ
	 *         in length, if a column is not primitive, if more than one binary column
	 *         is declared, or if a Writable cannot be instantiated
	 */
	@Override
	public void initialize(Configuration cfg, Properties tbl) throws SerDeException {

		geometryColumn = -1;

		// Read the configuration parameters
		String columnNameProperty = tbl.getProperty(HiveShims.serdeConstants.LIST_COLUMNS);
		String columnTypeProperty = tbl.getProperty(HiveShims.serdeConstants.LIST_COLUMN_TYPES);

		// Report a missing property as a SerDeException rather than a bare NullPointerException.
		if (columnNameProperty == null || columnTypeProperty == null) {
			throw new SerDeException(
					"Table properties must define both the column names and the column types");
		}

		// "".split(",") yields one empty name, so an empty list needs explicit handling.
		columnNames = new ArrayList<String>();
		if (columnNameProperty.length() > 0) {
			// Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
			columnNames.addAll(Arrays.asList(
					columnNameProperty.toLowerCase(java.util.Locale.ROOT).split(",")));
		}

		List<TypeInfo> typeInfos = new ArrayList<TypeInfo>();
		if (columnTypeProperty.length() > 0) {
			typeInfos = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
		}

		numColumns = columnNames.size();

		// Without this check a short type list surfaces as an IndexOutOfBoundsException below.
		if (typeInfos.size() != numColumns) {
			throw new SerDeException("Table defines " + numColumns + " column names but "
					+ typeInfos.size() + " column types");
		}

		columnOIs = new ArrayList<ObjectInspector>(numColumns);
		columnSet = new boolean[numColumns];

		for (int c = 0; c < numColumns; c++) {

			TypeInfo colTypeInfo = typeInfos.get(c);

			if (colTypeInfo.getCategory() != Category.PRIMITIVE){
				throw new SerDeException("Only primitive field types are accepted");
			}

			if (colTypeInfo.getTypeName().equals("binary")) {

				if (geometryColumn >= 0) {
					// only one column can be defined as binary for geometries
					throw new SerDeException(
							"Multiple binary columns defined.  Define only one binary column for geometries");
				}

				columnOIs.add(GeometryUtils.geometryTransportObjectInspector);
				geometryColumn = c;
			} else {
				columnOIs.add(TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(colTypeInfo));
			}
		}

		// standardStruct uses ArrayList to store the row.
		rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
				columnNames, columnOIs);

		// constructing the row objects, etc, which will be reused for all rows.
		rowBase = new ArrayList<Writable>(numColumns);
		row = new ArrayList<Writable>(numColumns);

		// set each value in rowBase to the writable that corresponds with its PrimitiveObjectInspector
		for (int c = 0; c < numColumns; c++) {

			PrimitiveObjectInspector poi = (PrimitiveObjectInspector)columnOIs.get(c);
			Writable writable;

			try {
				writable = (Writable)poi.getPrimitiveWritableClass().newInstance();
			} catch (InstantiationException e) {
				throw new SerDeException("Error creating Writable from ObjectInspector", e);
			} catch (IllegalAccessException e) {
				throw new SerDeException("Error creating Writable from ObjectInspector", e);
			}

			rowBase.add(writable);
			row.add(null); // default all values to null
		}
	}  // /initialize

	/**
	 * Parses one JSON record into the reusable row.
	 *
	 * <p>Every cell is first reset to null.  The record is then scanned for an object
	 * named "geometry" (handed to {@link #parseGeom(JsonParser)} when a geometry column
	 * exists, skipped otherwise) and for the object named by {@code attrLabel}, whose
	 * members are matched against the column names.
	 *
	 * <p>Parsing is best-effort: on a JSON error the problem is logged and the row is
	 * returned with whatever cells had been filled up to that point.
	 *
	 * @param json_in the record, expected to be a {@link Text}
	 * @return the shared row list; its content is overwritten by the next call
	 * @throws SerDeException declared by the interface; not thrown by this implementation
	 */
	@Override
	public Object deserialize(Writable json_in) throws SerDeException {
		Text json = (Text) json_in;

		// null out array because we reuse it and we don't want values persisting
		// from the last record
		for (int i = 0; i < numColumns; i++) {
			row.set(i, null);
		}

		JsonParser parser = null;
		try {
			parser = jsonFactory.createJsonParser(json.toString());

			for (JsonToken token = parser.nextToken(); token != null; token = parser.nextToken()) {
				if (token != JsonToken.START_OBJECT) {
					continue;
				}

				// For START_OBJECT the current name is the member name the object is bound to.
				String objectName = parser.getCurrentName();

				if ("geometry".equals(objectName)) {
					if (geometryColumn > -1) {
						// create geometry and insert into geometry field
						OGCGeometry ogcGeom = parseGeom(parser);
						row.set(geometryColumn, ogcGeom == null ? null :
								GeometryUtils.geometryToEsriShapeBytesWritable(ogcGeom));
					} else {
						// no geometry in select field set, don't even bother parsing
						parser.skipChildren();
					}
				} else if (attrLabel.equals(objectName)) {
					readAttributes(parser);
				}
			}

		} catch (IOException e) {
			// JsonParseException is an IOException; keep the best-effort contract but
			// report through the logger instead of printing to stderr.
			LOG.error("Error parsing JSON record", e);
		} finally {
			if (parser != null) {
				try {
					parser.close();
				} catch (IOException e) {
					LOG.warn("Error closing JSON parser", e);
				}
			}
		}

		return row;
	}

	/**
	 * Reads the members of the attributes object into the row.  The parser must be
	 * positioned on the START_OBJECT of the attributes object; on return it is on the
	 * matching END_OBJECT, or at end of input if the record is truncated.
	 *
	 * <p>Members whose value is itself an object or array are skipped whole and their
	 * cell stays null: the columns are primitive, and descending into the nested
	 * structure would misread its members as attributes.
	 */
	private void readAttributes(JsonParser parser) throws IOException {
		JsonToken token = parser.nextToken();

		while (token != JsonToken.END_OBJECT && token != null) {

			// hive makes all column names in the queries column list lower case;
			// Locale.ROOT keeps the match independent of the JVM's default locale
			String name = parser.getText().toLowerCase(java.util.Locale.ROOT);

			JsonToken valueToken = parser.nextToken();

			if (valueToken == JsonToken.START_OBJECT || valueToken == JsonToken.START_ARRAY) {
				parser.skipChildren();  // leaves the parser on the matching END token
			} else {
				// figure out which column index corresponds with the attribute name
				int fieldIndex = columnNames.indexOf(name);

				if (fieldIndex >= 0) {
					setRowFieldFromParser(fieldIndex, parser);
				}
			}

			token = parser.nextToken();
		}
	}

	/**
	 * Returns the struct inspector describing the rows produced by {@code deserialize}.
	 *
	 * @return the inspector held in {@code rowOI}
	 */
	@Override
	public ObjectInspector getObjectInspector() throws SerDeException {
		final ObjectInspector inspector = this.rowOI;
		return inspector;
	}

	/**
	 * Statistics accessor; this SerDe reports none.
	 *
	 * @return always {@code null}
	 */
	@Override
	public SerDeStats getSerDeStats() {
		final SerDeStats none = null;
		return none;
	}

	/**
	 * Reports the Writable type returned by {@code serialize}.
	 *
	 * @return {@code Text.class}
	 */
	@Override
	public Class<? extends Writable> getSerializedClass() {
		final Class<? extends Writable> serializedType = Text.class;
		return serializedType;
	}

	/**
	 * Serializes one row as a JSON record of the form
	 * {@code {"<attrLabel>":{...},"geometry":...}}.
	 *
	 * <p>All non-geometry columns are written inside the attributes object.  When a
	 * geometry column exists it is written afterwards as the "geometry" member, using
	 * {@link #outGeom(OGCGeometry)} for the value, or JSON null for a null cell.
	 *
	 * @param obj the row object
	 * @param oi  inspector for {@code obj}; must be a {@link StructObjectInspector}
	 * @return the record as {@link Text}, or {@code null} if JSON generation failed
	 *         (the failure is logged)
	 * @throws SerDeException declared by the interface; not thrown by this implementation
	 */
	@Override
	public Writable serialize(Object obj, ObjectInspector oi)
			throws SerDeException {

		// Only the StructObjectInspector contract is needed, so do not insist on the
		// standard implementation.
		StructObjectInspector structOI = (StructObjectInspector) oi;

		// get list of writables, one for each field in the row
		List<Object> fieldWritables = structOI.getStructFieldsDataAsList(obj);

		StringWriter writer = new StringWriter();
		JsonGenerator jsonGen = null;

		try {
			jsonGen = jsonFactory.createJsonGenerator(writer);

			jsonGen.writeStartObject();

			// first write attributes
			jsonGen.writeObjectFieldStart(attrLabel);

			for (int i = 0; i < fieldWritables.size(); i++) {
				if (i == geometryColumn)
					continue; // skip geometry, it comes later

				// A failure here leaves the document half-written, so let it reach the
				// handler below instead of carrying on with a corrupt record.
				generateJsonFromValue(fieldWritables.get(i), i, jsonGen);
			}

			jsonGen.writeEndObject();

			// if geometry column exists, write it
			if (geometryColumn > -1) {
				Object got = fieldWritables.get(geometryColumn);
				if (got == null) {
					jsonGen.writeObjectField("geometry", null);
				} else {
					BytesWritable bytesWritable = null;
					if (got instanceof BytesWritable)
						bytesWritable = (BytesWritable)got;
					else  // SparkSQL, #97
						bytesWritable = new BytesWritable((byte[])got);  // idea: avoid extra object
					OGCGeometry ogcGeometry = GeometryUtils.geometryFromEsriShape(bytesWritable);
					String geomJson = outGeom(ogcGeometry);

					// Let the generator emit the separator and the member name; only the
					// geometry text itself is written verbatim.
					jsonGen.writeFieldName("geometry");
					if (geomJson == null) {
						jsonGen.writeNull();
					} else {
						jsonGen.writeRawValue(geomJson);
					}
				}
			}

			jsonGen.writeEndObject();

			jsonGen.close();

		} catch (IOException e) {
			// covers JsonGenerationException / JsonProcessingException as well
			LOG.error("Error generating JSON", e);
			return null;
		} finally {
			// Release the generator on the failure path too; the result is discarded there.
			if (jsonGen != null && !jsonGen.isClosed()) {
				try {
					jsonGen.close();
				} catch (IOException ignored) {
					// the original failure has already been logged
				}
			}
		}

		return new Text(writer.toString());
	}


	/**
	 * Writes one cell to the generator as a member named after its column,
	 * dispatching on the runtime form of the value: null, lazy primitive,
	 * Writable, or any other object.
	 *
	 * @param value The attribute value as the object given by Hive
	 * @param fieldIndex column index of field in row
	 * @param jsonGen JsonGenerator
	 * @throws JsonProcessingException
	 * @throws IOException
	 */
	private void generateJsonFromValue(Object value, int fieldIndex, JsonGenerator jsonGen)
		throws JsonProcessingException, IOException {
		final String fieldName = columnNames.get(fieldIndex);
		final PrimitiveObjectInspector inspector =
				(PrimitiveObjectInspector) this.columnOIs.get(fieldIndex);

		if (value == null) {
			jsonGen.writeObjectField(fieldName, null);
			return;
		}

		if (value instanceof LazyPrimitive<?,?>) {  // have seen LazyString, #25
			generateJsonFromLazy((LazyPrimitive<?,?>) value, fieldIndex, fieldName, inspector, jsonGen);
			return;
		}

		if (value instanceof Writable) {
			generateJsonFromWritable((Writable) value, fieldIndex, fieldName, inspector, jsonGen);
			return;
		}

		// SparkSQL, #97: the value is handed to the generator as-is
		jsonGen.writeObjectField(fieldName, value);
	}
	/**
	 * Writes a lazy primitive by unwrapping its Writable and delegating to
	 * {@code generateJsonFromWritable}.
	 */
	private void generateJsonFromLazy(LazyPrimitive<?,?> value, int fieldIndex, String label,
									  PrimitiveObjectInspector poi, JsonGenerator jsonGen)
		throws JsonProcessingException, IOException {
		final Writable unwrapped = value.getWritableObject();
		generateJsonFromWritable(unwrapped, fieldIndex, label, poi, jsonGen);
	}

	/**
	 * Writes a Writable cell as a member named {@code label}.  The value is first
	 * converted to its Java object through the column's inspector; date-like values
	 * are written as a long number of milliseconds, everything else as the object itself.
	 */
	private void generateJsonFromWritable(Writable value, int fieldIndex, String label,
										  PrimitiveObjectInspector poi, JsonGenerator jsonGen)
		throws JsonProcessingException, IOException {
		final Object javaValue = poi.getPrimitiveJavaObject(value);

		if (!(javaValue instanceof java.util.Date)) {
			jsonGen.writeObjectField(label, javaValue);
			return;
		}

		final long millis = ((java.util.Date) javaValue).getTime();

		// Timestamps are written unshifted; other dates have the default zone's
		// offset at that instant subtracted.
		// NOTE(review): confirm the sign of the shift round-trips with parseDate.
		final long zoneShift;
		if (javaValue instanceof java.sql.Timestamp) {
			zoneShift = 0;
		} else {
			zoneShift = tz.getOffset(millis);
		}

		jsonGen.writeObjectField(label, millis - zoneShift);  // UTC
	}

    /**
     * Write OGCGeometry to JSON.
     *
     * Called by serialize for a non-null geometry cell; the returned text is placed
     * into the output record unescaped, as the value of the "geometry" member, so it
     * must itself be valid JSON.
     *
     * @param geom the geometry decoded from the row's geometry column
     * @return the JSON text for the geometry
     */
	abstract protected String outGeom(OGCGeometry geom);

    /**
     * Parse OGCGeometry from JSON.
     *
     * Called by deserialize when a geometry column exists and the parser's current
     * token is the START_OBJECT of the member named "geometry".  A null result leaves
     * the geometry cell of the row null.  No checked exceptions are declared, so
     * implementations have to handle parser errors themselves.
     * NOTE(review): deserialize resumes with nextToken() afterwards - presumably the
     * parser should be left on the END_OBJECT of the geometry; confirm in subclasses.
     *
     * @param parser parser positioned on the geometry object
     * @return the parsed geometry, or null
     */
	abstract protected OGCGeometry parseGeom(JsonParser parser);

	/**
	 * Reads a date from the parser's current token.  An integer token is taken as
	 * epoch milliseconds; any other token is parsed from its text with the pattern
	 * "yyyy-MM-dd".
	 *
	 * @return the date, or null when the text does not match the pattern
	 */
	private java.sql.Date parseDate(JsonParser parser) throws JsonParseException, IOException {
		if (JsonToken.VALUE_NUMBER_INT.equals(parser.getCurrentToken())) {
			final long millis = parser.getLongValue();
			return new java.sql.Date(millis);
		}

		try {
			final long millis = parseTime(parser.getText(), "yyyy-MM-dd");
			return new java.sql.Date(millis);
		} catch (java.text.ParseException e) {
			return null;  // not a date in the expected form
		}
	}

	/**
	 * Reads a timestamp from the parser's current token.
	 *
	 * <p>An integer token is taken as epoch milliseconds.  Otherwise the token text is
	 * parsed: text containing a '.' is truncated to at most three fractional digits
	 * and parsed in JDBC escape form ("yyyy-MM-dd HH:mm:ss.SSS"); text without a '.'
	 * is tried in JDBC escape form first and then against "yyyy-MM-dd HH:mm:ss",
	 * "yyyy-MM-dd HH:mm" and "yyyy-MM-dd".
	 *
	 * @return the timestamp, or null when the text matches none of the accepted forms
	 */
	private java.sql.Timestamp parseTime(JsonParser parser) throws JsonParseException, IOException {
		if (JsonToken.VALUE_NUMBER_INT.equals(parser.getCurrentToken())) {
			long epoch = parser.getLongValue();
			return new java.sql.Timestamp(epoch);
		}

		String value = parser.getText();
		int point = value.indexOf('.');

		if (point >= 0) {
			// Truncate to milliseconds, but never past the end of the text: a value
			// with fewer than three fractional digits would otherwise make substring
			// throw StringIndexOutOfBoundsException.
			int end = Math.min(value.length(), point + 4);
			return parseTime(value.substring(0, end));  // "yyyy-MM-dd HH:mm:ss.SSS" - truncate
			// idea: jst.setNanos; alt: Java-8, JodaTime, javax.xml.bind.DatatypeConverter
		}

		java.sql.Timestamp jst = parseTime(value);    // "yyyy-MM-dd HH:mm:ss.SSS"
		String[] formats = {"yyyy-MM-dd HH:mm:ss",	"yyyy-MM-dd HH:mm", "yyyy-MM-dd"};
		for (String format: formats) {
			if (jst != null) break;
			try {
				jst = new java.sql.Timestamp(parseTime(value, format));
			} catch (java.text.ParseException e) {
				// remain null
			}
		}
		return jst;
	}

	/**
	 * Parses text in JDBC timestamp escape form.
	 *
	 * @return the timestamp, or null when {@code Timestamp.valueOf} rejects the text
	 */
	private java.sql.Timestamp parseTime(String value) {
		java.sql.Timestamp parsed;
		try {
			parsed = java.sql.Timestamp.valueOf(value);
		} catch (IllegalArgumentException iae) {
			parsed = null;
		}
		return parsed;
	}

	/**
	 * Parses text against a SimpleDateFormat pattern.
	 *
	 * @return the parsed instant as epoch milliseconds
	 * @throws java.text.ParseException when the text does not match the pattern
	 */
	private long parseTime(String value, String format) throws java.text.ParseException {  // epoch
		final java.text.SimpleDateFormat formatter = new java.text.SimpleDateFormat(format);
		final java.util.Date parsed = formatter.parse(value);
		return parsed.getTime();
	}

	/**
	 * Sets the reusable Writable of a column to the value at the parser's current
	 * token and then references it from row.
	 *
	 * <p>The row cell stays null when the token is JSON null, and when a date or
	 * timestamp cannot be parsed (passing null to the Writable's setter would store
	 * the epoch instead of leaving the cell NULL).  The Writable is put into row only
	 * after its value has been set, so a conversion failure cannot expose the value
	 * left in the Writable by a previous record.
	 *
	 * @param fieldIndex column index of field in row
	 * @param parser JsonParser pointing to the attribute
	 * @throws JsonParseException
	 * @throws IOException
	 */
	private void setRowFieldFromParser(int fieldIndex, JsonParser parser) throws JsonParseException, IOException{

		PrimitiveObjectInspector poi = (PrimitiveObjectInspector)this.columnOIs.get(fieldIndex);
		if (JsonToken.VALUE_NULL == parser.getCurrentToken())
			return;  // leave the row-cell as null

		// the reusable writable for this column; published to row only on success
		Writable cell = rowBase.get(fieldIndex);

		switch (poi.getPrimitiveCategory()){
		case BYTE:
			((ByteWritable)cell).set(parser.getByteValue());
			break;
		case SHORT:
			((ShortWritable)cell).set(parser.getShortValue());
			break;
		case INT:
			((IntWritable)cell).set(parser.getIntValue());
			break;
		case LONG:
			((LongWritable)cell).set(parser.getLongValue());
			break;
		case DOUBLE:
			((DoubleWritable)cell).set(parser.getDoubleValue());
			break;
		case FLOAT:
			((FloatWritable)cell).set(parser.getFloatValue());
			break;
		case BOOLEAN:
			((BooleanWritable)cell).set(parser.getBooleanValue());
			break;
		case DATE: {    // DateWritable stores days not milliseconds.
			java.sql.Date date = parseDate(parser);
			if (date == null)
				return;  // unparseable: leave the row-cell as null
			((DateWritable)cell).set(date);
			break;
		}
		case TIMESTAMP: {
			java.sql.Timestamp timestamp = parseTime(parser);
			if (timestamp == null)
				return;  // unparseable: leave the row-cell as null
			((TimestampWritable)cell).set(timestamp);
			break;
		}
		default:    // STRING/unrecognized
			((Text)cell).set(parser.getText());
			break;
		}

		row.set(fieldIndex, cell);
	}

}