* Copyright 2018 ZuInnoTe (Jörn Franke) <[email protected]>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*    http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
package org.zuinnote.hadoop.excel.hive.daoserde;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.zuinnote.hadoop.excel.hive.serde.ExcelSerde;
import org.zuinnote.hadoop.office.format.common.dao.SpreadSheetCellDAO;

 * SerDe for Excel files (.xls,.xlsx) to retrieve each Cell in an Excel as a
 * dedicated row in Hive containing the information formattedValue, formula,
 * comment, cellAddress, Sheet. Writing is possible by providing one cell / row
 * in Hive (it does not prevent you having multiple cells in the same row in
 * Excel, you just use the right cell address, e.g. first row in Hive as cell
 * address A1 and second row in Hive cell address B1 then both entries are in the
 * same row in Excel)
public class ExcelSpreadSheetCellDAOSerde extends AbstractSerDe {
	private static final Log LOG = LogFactory.getLog(ExcelSpreadSheetCellDAOSerde.class.getName());
	private static final String[] columnNames = new String[]{"formattedValue","comment","formula","address","sheetName"};
	private ObjectInspector oi;
	public void initialize(Configuration conf, Properties tbl) throws SerDeException {
		this.initialize(conf, tbl, null);


	 * Initializes the SerDe \n You can define in the table properties (additionally
	 * to the standard Hive properties) the following options \n Any of the
	 * HadoopOffice options (hadoopoffice.*), such as encryption, signing, low
	 * footprint mode, linked workbooks, can be defined in the table properties @see
	 * <a href=
	 * "https://github.com/ZuInnoTe/hadoopoffice/wiki/Hadoop-File-Format">HadoopOffice
	 * configuration</a>\n
	 * @param conf                Hadoop Configuration
	 * @param prop                table properties.
	 * @param partitionProperties ignored. Partitions are not supported.

	public void initialize(Configuration conf, Properties prop, Properties partitionProperties) throws SerDeException {
		LOG.debug("Initializing Excel Hive Serde");
		// copy hadoopoffice options
		LOG.debug("Configuring HadoopOffice Format");
		Set<Entry<Object, Object>> entries = prop.entrySet();
		for (Entry<Object, Object> entry : entries) {
			if ((entry.getKey() instanceof String) && ((String) entry.getKey()).startsWith(ExcelSerde.HOSUFFIX)) {
				if (("TRUE".equalsIgnoreCase((String) entry.getValue()))
						|| ("FALSE".equalsIgnoreCase(((String) entry.getValue())))) {
					conf.setBoolean((String) entry.getKey(), Boolean.valueOf((String) entry.getValue()));
				} else {
					conf.set((String) entry.getKey(), (String) entry.getValue());
		LOG.debug("Configuring Object inspector");
		final List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(ExcelSpreadSheetCellDAOSerde.columnNames.length);
		for (int i=0;i<ExcelSpreadSheetCellDAOSerde.columnNames.length;i++) {
		this.oi = ObjectInspectorFactory.getStandardStructObjectInspector(Arrays.asList(ExcelSpreadSheetCellDAOSerde.columnNames), columnOIs);
		LOG.debug("Finished Initialization");

	public Class<? extends Writable> getSerializedClass() {
		return SpreadSheetCellDAO.class;

	 * Writes a row in Hive containing exactly 5 elements (String) to a SpreadSheetCellDAO. Order: "formattedValue","comment","formula","address","sheetName"
	public Writable serialize(Object arg0, ObjectInspector arg1) throws SerDeException {
		if (!(arg1 instanceof StructObjectInspector)) {
			throw new SerDeException("Expect a row of Strings for serialization");
		final StructObjectInspector outputOI = (StructObjectInspector) arg1;
		final List<? extends StructField> outputFields = outputOI.getAllStructFieldRefs();
		if (outputFields.size()!=ExcelSpreadSheetCellDAOSerde.columnNames.length) {
			throw new SerDeException("Expected "+ExcelSpreadSheetCellDAOSerde.columnNames.length+" fields characterizing a cell: \"formattedValue\",\"comment\",\"formula\",\"address\",\"sheetName\", but found "+outputFields.size()+" fields");
		if (arg0==null) {
			return null;
		// get field data
		// formattedValue
		int columnNum=0;
		final Object foFormattedValue = outputOI.getStructFieldData(arg0, outputFields.get(columnNum));
		final ObjectInspector oiFormattedValue = outputFields.get(columnNum).getFieldObjectInspector();
		String formattedValue = String.valueOf(((PrimitiveObjectInspector) oiFormattedValue).getPrimitiveJavaObject(foFormattedValue));
		// comment
		final Object foComment= outputOI.getStructFieldData(arg0, outputFields.get(columnNum));
		final ObjectInspector oiComment = outputFields.get(columnNum).getFieldObjectInspector();
		String comment = String.valueOf(((PrimitiveObjectInspector) oiComment).getPrimitiveJavaObject(foComment));
		// formula
		final Object foFormula= outputOI.getStructFieldData(arg0, outputFields.get(columnNum));
		final ObjectInspector oiFormula = outputFields.get(columnNum).getFieldObjectInspector();
		String formula = String.valueOf(((PrimitiveObjectInspector) oiFormula).getPrimitiveJavaObject(foFormula));
		// address
		final Object foAddress= outputOI.getStructFieldData(arg0, outputFields.get(columnNum));
		final ObjectInspector oiAddress = outputFields.get(columnNum).getFieldObjectInspector();
		String address = String.valueOf(((PrimitiveObjectInspector) oiAddress).getPrimitiveJavaObject(foAddress));
		// sheetName
		final Object foSheetName= outputOI.getStructFieldData(arg0, outputFields.get(columnNum));
		final ObjectInspector oiSheetName = outputFields.get(columnNum).getFieldObjectInspector();
		String sheetName = String.valueOf(((PrimitiveObjectInspector) oiSheetName).getPrimitiveJavaObject(foSheetName));
		return new SpreadSheetCellDAO(formattedValue,comment,formula,address,sheetName);

	public SerDeStats getSerDeStats() {
		// no statistics supported
		return null;

	 * Returns one cell in an Excel as a row in Hive containing 5 elements:  "formattedValue","comment","formula","address","sheetName"
	public Object deserialize(Writable arg0) throws SerDeException {
		// check for null
		if ((arg0 == null) || (arg0 instanceof NullWritable)) {
			return null;
		if (!(arg0 instanceof SpreadSheetCellDAO)) {
			throw new SerDeException("Table does not contain objects of type SpreadSheetCellDAO. Did you use the ExcelCellInputFormat of HadoopOffice?");
		String[] spreadSheetCellRow = new String[5];
		SpreadSheetCellDAO obj = (SpreadSheetCellDAO)arg0;
		spreadSheetCellRow[0] = obj.getFormattedValue();
		spreadSheetCellRow[1] = obj.getComment();
		spreadSheetCellRow[2] = obj.getFormula();
		spreadSheetCellRow[3] = obj.getAddress();
		spreadSheetCellRow[4] = obj.getSheetName();
		return spreadSheetCellRow;

	public ObjectInspector getObjectInspector() throws SerDeException {
		return this.oi;