java source code of ExcelFileInputFormat

hadoopoffice-master
- gradle
  - wrapper
    - gradle-wrapper.properties
    - gradle-wrapper.jar
- flinkts
  - src
    - main
      - scala
        org
        zuinnote
        flink
        office
        excel
        ExcelFlinkTableSource.scala
        ExcelFlinkTableSink.scala
    - it
      - resources
        testsimple.xlsx
      - scala
        org
        zuinnote
        flink
        office
        excel
        FlinkScalaExcelTableSourceIntegrationSpec.scala
        FlinkScalaExcelTableSinkIntegrationSpec.scala
    - test
      - scala
        org
        zuinnote
        flink
        office
        excel
        FlinkScalaHadoopOfficeExcelDSSpec.scala
  - built.sbt
  - project
    - assembly.sbt
    - plugins.sbt
- examples
  - scala-spark2-excel-out-ds
    - src
      - main
        scala
        org
        zuinnote
        spark
        office
        example
        excel
        SparkScalaExcelOutDataSource.scala
      - it
        scala
        org
        zuinnote
        spark
        office
        example
        excel
        Spark2ScalaDSExcelOutIntegrationSpec.scala
    - project
      - assembly.sbt
      - plugins.sbt
    - build.sbt
  - mapreduce-exceloutput
    - src
      - main
        java
        org
        zuinnote
        hadoop
        office
        example
        driver
        CSV2ExcelDriver.java
        tasks
        HadoopOfficeExcelMap.java
        HadoopOfficeExcelReducer.java
      - integration-test
        resources
        simplecsv.csv
        java
        org
        zuinnote
        hadoop
        office
        example
        MapReduceExcelOutputIntegrationTest.java
    - build.gradle
  - scala-flinkts-excel
    - src
      - main
        scala
        org
        zuinnote
        flink
        office
        example
        excel
        FlinkTableSourceTableSinkExample.scala
      - it
        resources
        testsimple.xlsx
        scala
        org
        zuinnote
        flink
        office
        example
        excel
        FlinkTableSourceTableSinkScalaExcelIntegrationSpec.scala
      - test
        scala
        org
        zuinnote
        flink
        office
        example
        excel
        FlinkScalaHadoopOfficeExcelDSSpec.scala
    - project
      - assembly.sbt
      - plugins.sbt
    - build.sbt
  - mapreduce-excelinput
    - src
      - main
        java
        org
        zuinnote
        hadoop
        office
        example
        driver
        Excel2CSVDriver.java
        tasks
        HadoopOfficeExcelMap.java
        HadoopOfficeExcelReducer.java
      - integration-test
        resources
        excel2013test.xlsx
        java
        org
        zuinnote
        hadoop
        office
        example
        MapReduceExcelInputIntegrationTest.java
    - build.gradle
  - scala-spark2-excel-in-ds
    - src
      - main
        scala
        org
        zuinnote
        spark
        office
        example
        excel
        SparkScalaExcelInDataSource.scala
      - it
        resources
        excel2013test.xlsx
        scala
        org
        zuinnote
        spark
        office
        example
        excel
        Soark2ScalaDSExcelInIntegrationSpec.scala
    - project
      - assembly.sbt
      - plugins.sbt
    - build.sbt
  - hive-excel
    - hive-exceldao.sql
    - hive-excel.sql
  - scala-spark-excelinput
    - src
      - main
        scala
        org
        zuinnote
        spark
        office
        example
        excel
        SparkScalaExcelIn.scala
      - it
        resources
        excel2013test.xlsx
        scala
        org
        zuinnote
        spark
        office
        example
        excel
        SparkScalaExcelInputIntegrationSpec.scala
    - project
      - assembly.sbt
      - plugins.sbt
    - build.sbt
  - scala-spark-exceloutput
    - src
      - main
        scala
        org
        zuinnote
        spark
        office
        example
        excel
        SparkScalaExcelOut.scala
      - it
        resources
        simplecsv.csv
        scala
        org
        zuinnote
        spark
        office
        example
        excel
        SparkScalaExcelOutputIntegrationSpec.scala
    - project
      - assembly.sbt
      - plugins.sbt
    - build.sbt
  - scala-flinkds-excel-dao
    - src
      - main
        scala
        org
        zuinnote
        flink
        office
        example
        excel
        FlinkDSScalaExcelDAO.scala
      - it
        resources
        excel2013test.xlsx
        scala
        org
        zuinnote
        flink
        office
        example
        excel
        FlinkDSScalaExcelDAOIntegrationSpec.scala
      - test
        scala
        org
        zuinnote
        flink
        office
        example
        excel
        FlinkScalaHadoopOfficeExcelDSSpec.scala
    - project
      - assembly.sbt
      - plugins.sbt
    - build.sbt
  - scala-flinkds-excel-simple
    - src
      - main
        scala
        org
        zuinnote
        flink
        office
        example
        excel
        FlinkDSScalaExcelSimple.scala
      - it
        resources
        testsimple.xlsx
        scala
        org
        zuinnote
        flink
        office
        example
        excel
        FlinkDSScalaExcelSimpleIntegrationSpec.scala
      - test
        scala
        org
        zuinnote
        flink
        office
        example
        excel
        FlinkScalaHadoopOfficeExcelDSSpec.scala
    - project
      - assembly.sbt
      - plugins.sbt
    - build.sbt
- .travis
  - publish-junitreport.sh
  - publish-javadoc.sh
- fileformat
  - src
    - main
      - java
        org
        zuinnote
        hadoop
        office
        format
        common
        dao
        SpreadSheetCellDAOArrayWritable.java
        SpreadSheetCellDAO.java
        TextArrayWritable.java
        HadoopOfficeReadConfiguration.java
        OfficeWriter.java
        OfficeReader.java
        HadoopOfficeWriteConfiguration.java
        util
        CertificateChainVerificationUtil.java
        msexcel
        MSExcelOOXMLSignUtil.java
        MSExcelUtil.java
        HadoopKeyStoreManager.java
        converter
        ExcelConverterSimpleSpreadSheetCellDAO.java
        datatypes
        GenericBooleanDataType.java
        GenericDateDataType.java
        GenericNumericDataType.java
        GenericDoubleDataType.java
        GenericDataType.java
        GenericByteDataType.java
        GenericBigDecimalDataType.java
        GenericTimestampDataType.java
        GenericLongDataType.java
        GenericShortDataType.java
        GenericStringDataType.java
        GenericIntegerDataType.java
        GenericFloatDataType.java
        HadoopFileReader.java
        HadoopUtil.java
        writer
        OfficeWriterException.java
        OfficeSpreadSheetWriterInterface.java
        InvalidCellSpecificationException.java
        msexcel
        internal
        EncryptedTempData.java
        SecureSXSSFWorkbook.java
        EncryptedZipEntrySource.java
        MSExcelWriter.java
        MSExcelLowFootprintWriter.java
        InvalidWriterConfigurationException.java
        parser
        OfficeReaderParserInterface.java
        FormatNotUnderstoodException.java
        msexcel
        MSExcelParser.java
        MSExcelLowFootprintParser.java
        internal
        HSSFEventParser.java
        XSSFPullParser.java
        EncryptedCachedDiskStringsTable.java
        cache
        LRUCache.java
        XSSFEventParser.java
        mapred
        AbstractSpreadSheetDocumentFileOutputFormat.java
        ExcelCellFileInputFormat.java
        ExcelFileInputFormat.java
        AbstractSpreadSheetDocumentRecordWriter.java
        ExcelFileOutputFormat.java
        ExcelRecordWriter.java
        ExcelCellRecordReader.java
        ExcelRowFileOutputFormat.java
        ExcelRowRecordWriter.java
        ExcelRecordReader.java
        AbstractSpreadSheetDocumentRecordReader.java
        AbstractSpreadSheetDocumentFileInputFormat.java
        mapreduce
        AbstractSpreadSheetDocumentFileOutputFormat.java
        ExcelCellFileInputFormat.java
        ExcelFileInputFormat.java
        AbstractSpreadSheetDocumentRecordWriter.java
        ExcelFileOutputFormat.java
        ExcelRecordWriter.java
        ExcelCellRecordReader.java
        ExcelRowFileOutputFormat.java
        ExcelRowRecordWriter.java
        ExcelRecordReader.java
        AbstractSpreadSheetDocumentRecordReader.java
        AbstractSpreadSheetDocumentFileInputFormat.java
    - test
      - resources
        multisheetheader.xlsx
        odstest.ods
        odftexttest.odt
        excel2013linkedworkbooks.xlsx
        skipsheet.xlsx
        excel2013test.xlsx
        word2013test.docx
        excel2013empty.xlsx
        TESTTABLedbase5.DBF
        excel2003linkedworkbooks.xls
        cacerts
        alternatelocationlinkedwb
        excel2013linkedworkbookslink2.xlsx
        excel2003linkedworkbookslink1.xls
        excel2013linkedworkbookslink1.xlsx
        excel2003linkedworkbookslink2.xls
        TESTTABLedbase4.DBT
        datetimestamp.xlsx
        signingtruststore.jks
        templatetest1.xlsx
        testsigning.pfx
        templatetest1encrypt.xlsx
        excel2013encrypt.xlsx
        excel2013testemptyrows.xlsx
        excel2013testmultisheet.xlsx.gz
        excel2003testmultisheet.xls
        excel2013linkedworkbookslink2.xlsx
        excel2003encrypt.xls
        excel2013comment.xlsx
        excel2013testmultisheet.xlsx
        TESTTABLedbase4.DBF
        TESTTABLedbase5.DBT
        excel2013testmultisheet.xlsx.bz2
        testsimple.xlsx
        TESTTABLedbase3.DBT
        excel2003linkedworkbookslink1.xls
        testsigningCA.pfx
        excel2003testemptyrows.xls
        keystore.jceks
        excel2013linkedworkbookslink1.xlsx
        simplecsv.csv
        excel2003linkedworkbookslink2.xls
        TESTTABLedbase3.DBF
        excel2003empty.xls
        excel2003test.xls
      - java
        org
        zuinnote
        hadoop
        office
        format
        common
        OfficeReaderTest.java
        HadoopKeyStoreManagerTest.java
        util
        CertificateChainVerificationUtilTest.java
        converter
        ExcelConverterSimpleSpreadSheetCellDAOTest.java
        OfficeCheckTestDataAvailable.java
        mapred
        OfficeFormatHadoopExcelLowFootPrintStaXTest.java
        OfficeFormatHadoopExcelLowFootPrintSAXTest.java
        mapreduce
        OfficeFormatHadoopExcelLowFootPrintStaXTest.java
        OfficeFormatHadoopExcelLowFootPrintSAXTest.java
  - build.gradle
- gradlew.bat
- LICENSE
- gradlew
- sonar-project.properties
- flinkds
  - src
    - main
      - java
        org
        zuinnote
        flink
        office
        excel
        RowSimpleExcelFlinkFileOutputFormat.java
        ExcelFlinkFileInputFormat.java
        SimpleExcelFlinkFileInputFormat.java
        RowSimpleExcelFlinkFileInputFormat.java
        ExcelFlinkFileOutputFormat.java
        SimpleExcelFlinkFileOutputFormat.java
        AbstractSpreadSheetFlinkFileInputFormat.java
        common
        FlinkKeyStoreManager.java
        FlinkFileReader.java
        AbstractSpreadSheetFlinkFileOutputFormat.java
    - test
      - resources
        excel2013test.xlsx
        cacerts
        testsigning.pfx
        testsimple.xlsx
        keystore.jceks
        excel2003test.xls
      - java
        org
        zuinnote
        flink
        office
        excel
        FlinkSimpleExcelFileInputFormatTest.java
        FlinkExcelFileInputFormatTest.java
        FlinkExcelFileOutputFormatTest.java
        FlinkSimpleExcelFileOutputFormatTest.java
        common
        FlinkKeyStoreManagerTest.java
  - build.gradle
- CONTRIBUTING.md
- SECURITY.md
- build.gradle
- .travis.yml
- README.md
- settings.gradle
- NOTICE
- .gitignore
- hiveserde
  - src
    - main
      - resources
        hive-deploy.sql
      - java
        org
        zuinnote
        hadoop
        excel
        hive
        outputformat
        HiveExcelRowFileOutputFormat.java
        HiveExcelCellFileOutputFormat.java
        daoserde
        ExcelSpreadSheetCellDAOStorageFormatDescriptor.java
        ExcelSpreadSheetCellDAOSerde.java
        serde
        ExcelStorageFormatDescriptor.java
        ExcelSerde.java
    - test
      - resources
        testdata
        excel2013test.xlsx
        testsimple.xlsx
        excel2003test.xls
      - java
        org
        zuinnote
        hadoop
        excel
        hive
        daoserde
        ExcelSpreadSheetCellDAOSerdeTest.java
        serde
        ExcelTextSerdeTest.java
  - build.gradle

/**
* Copyright 2016 ZuInnoTe (Jörn Franke) <[email protected]>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/

package org.zuinnote.hadoop.office.format.mapred;

import java.io.IOException;

import org.apache.hadoop.mapred.InputSplit;


import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RecordReader;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.ArrayWritable;

import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.Log;

import java.security.GeneralSecurityException;

import org.zuinnote.hadoop.office.format.common.HadoopOfficeReadConfiguration;
import org.zuinnote.hadoop.office.format.common.parser.*;

/**
 * File input format (old {@code mapred} API) for Excel documents. Keys are {@link Text} and values
 * are {@link ArrayWritable}; the records themselves are produced by {@link ExcelRecordReader}.
 */
public class ExcelFileInputFormat extends AbstractSpreadSheetDocumentFileInputFormat<ArrayWritable> {

	private static final Log LOGIF = LogFactory.getLog(ExcelFileInputFormat.class.getName());

	/**
	 * Creates the record reader for the given split.
	 *
	 * <p>As a side effect the job configuration is modified: the mime type option
	 * ({@link HadoopOfficeReadConfiguration#CONF_MIMETYPE}) is set to {@code "ms-excel"}.
	 *
	 * @param split the split to read; it is cast to {@link FileSplit}
	 * @param job the job configuration; it is passed on to the reader after the mime type has been set
	 * @param reporter the reporter passed on to the reader
	 * @return a new {@link ExcelRecordReader}, or {@code null} if creating it failed with a
	 *         {@link FormatNotUnderstoodException} or a {@link GeneralSecurityException}
	 *         (the exception is logged at error level in that case)
	 * @throws IOException declared by the overridden method signature
	 */
	@Override
	public RecordReader<Text, ArrayWritable> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
		// Stays null when the reader cannot be created; callers receive null in that case.
		RecordReader<Text, ArrayWritable> excelReader = null;
		// Tell the reader that this is an MS Excel document. The format of the Excel (old vs new) is detected automatically.
		job.set(HadoopOfficeReadConfiguration.CONF_MIMETYPE, "ms-excel");
		final FileSplit fileSplit = (FileSplit) split;
		try {
			excelReader = new ExcelRecordReader(fileSplit, job, reporter);
		} catch (FormatNotUnderstoodException formatException) {
			LOGIF.error(formatException);
		} catch (GeneralSecurityException securityException) {
			LOGIF.error(securityException);
		}
		return excelReader;
	}

	/**
	 * Intentionally empty: this input format does not read anything from the job configuration here.
	 *
	 * @param conf the job configuration (ignored)
	 */
	@Override
	public void configure(JobConf conf) {
		// not used
	}

	/**
	 * Excel documents are never split. Apache POI/library requires full documents, so a file
	 * cannot be split correctly.
	 * Most of the time Office documents are small anyway (smaller than the default HDFS blocksize)
	 * and can be processed fast. Hence, you should put them in Hadoop Archives (HAR), either
	 * uncompressed or compressed, to reduce load on the namenode.
	 *
	 * @param fs the file system of the file (ignored)
	 * @param file the file in question (ignored)
	 * @return always {@code false}
	 */
	@Override
	protected boolean isSplitable(FileSystem fs, Path file) {
		return false;
	}

}