/******************************************************************************* * Copyright 2017 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ package com.google.cloud.dataflow.examples.opinionanalysis; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.sql.ResultSet; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.ThreadLocalRandom; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.AvroCoder; import org.apache.beam.sdk.coders.DefaultCoder; import org.apache.beam.sdk.coders.KvCoder; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.coders.VarLongCoder; import org.apache.beam.sdk.io.TextIO; import org.apache.beam.sdk.io.TextIO.Read; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; import org.apache.beam.sdk.io.gcp.bigtable.BigtableIO; import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; import org.apache.beam.sdk.io.jdbc.JdbcIO; import org.apache.beam.sdk.io.jdbc.JdbcIO.RowMapper; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.ValueProvider; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.Flatten; import org.apache.beam.sdk.transforms.GroupByKey; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.View; import org.apache.beam.sdk.transforms.DoFn.ProcessContext; import org.apache.beam.sdk.transforms.DoFn.ProcessElement; import org.apache.beam.sdk.transforms.join.CoGbkResult; import org.apache.beam.sdk.transforms.join.CoGroupByKey; import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionList; import org.apache.beam.sdk.values.PCollectionTuple; import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.TupleTagList; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVRecord; import org.apache.commons.io.IOUtils; import org.joda.time.Instant; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.api.services.bigquery.model.TableFieldSchema; import com.google.api.services.bigquery.model.TableRow; import com.google.api.services.bigquery.model.TableSchema; import com.google.bigtable.v2.Mutation; import com.google.cloud.bigtable.config.BigtableOptions; import com.google.cloud.dataflow.examples.opinionanalysis.IndexerPipelineUtils.ExtractCommentInfoFn; import com.google.cloud.dataflow.examples.opinionanalysis.IndexerPipelineUtils.ExtractPostDataFn; import 
com.google.cloud.dataflow.examples.opinionanalysis.io.RecordFileSource; import com.google.cloud.dataflow.examples.opinionanalysis.model.InputContent; import com.google.cloud.dataflow.examples.opinionanalysis.solutions.FileIndexerPipelineOptions; import com.google.cloud.dataflow.examples.opinionanalysis.transforms.Reshuffle; import com.google.cloud.dataflow.examples.opinionanalysis.util.PartitionedTableRef; import com.google.cloud.dataflow.examples.opinionanalysis.util.PipelineTags; import com.google.cloud.language.v1.Document.Type; import com.google.cloud.language.v1.EncodingType; import com.google.cloud.language.v1.Entity; import com.google.cloud.language.v1.LanguageServiceClient; import com.google.cloud.language.v1.Sentiment; import com.google.cloud.language.v1.AnalyzeEntitiesRequest; import com.google.cloud.language.v1.AnalyzeEntitiesResponse; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.protobuf.ByteString; import sirocco.indexer.Indexer; import sirocco.indexer.IndexingConsts; import sirocco.indexer.util.LogUtils; import sirocco.model.ContentIndex; import sirocco.model.summary.ContentIndexSummary; import sirocco.model.summary.Document; import sirocco.model.summary.DocumentTag; import sirocco.model.summary.WebResource; public class IndexerPipeline { private static final Logger LOG = LoggerFactory.getLogger(IndexerPipeline.class); private static final long REPORT_LONG_INDEXING_DURATION = 10000; // Report indexing duration longer than 10s. private static final String EMPTY_TITLE_KEY_PREFIX = "No Title"; // Used in text dedupe grouping. public static void main(String[] args) throws Exception { IndexerPipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(IndexerPipelineOptions.class); Pipeline pipeline = createIndexerPipeline(options); pipeline.run(); } /** * This function creates the DAG graph of transforms. It can be called from main() * as well as from the ControlPipeline. 
 * @param options pipeline options that select the input source and the output destinations
 * @return the fully constructed (but not yet run) indexer Pipeline
 * @throws Exception if the supplied options fail validation or a source cannot be configured
 */
public static Pipeline createIndexerPipeline(IndexerPipelineOptions options) throws Exception {

    IndexerPipelineUtils.validateIndexerPipelineOptions(options);
    Pipeline pipeline = Pipeline.create(options);

    // PHASE: Read raw content from sources
    PCollection<InputContent> readContent = null;

    if (options.isSourcePubsub()) {
        readContent = pipeline
            .apply(PubsubIO.readStrings().fromTopic(options.getPubsubTopic()))
            .apply(ParDo.of(new ParseRawInput()));
    } else if (options.isSourceJDBC()) {
        readContent = pipeline
            .apply(readDBRows(options));
    } else if (options.isSourceRedditBQ()) {
        // Read Reddit Posts and Comments and then join them using CoGroupByKey
        readContent = joinRedditPostsAndComments(pipeline, options);
    } else if (options.isSourceGDELTbucket()) {
        readContent = pipeline
            .apply("Read GDELT files", TextIO.read().from(options.getInputFile()))
            .apply("Parse JSON into InputContent", ParDo.of(new ParseGDELTJsonInput()));
    } else {
        // Read from GCS files
        /*
        readContent = pipeline
            .apply("Read from GCS files", org.apache.beam.sdk.io.Read.from(new RecordFileSource<String>(
                ValueProvider.StaticValueProvider.of(options.getInputFile()),
                StringUtf8Coder.of(), RecordFileSource.DEFAULT_RECORD_SEPARATOR)))
            .apply(ParDo.of(new ParseRawInput()));
        */
        Read r = TextIO.read().from(options.getInputFile());
        if (options.getRecordDelimiters() != null && !options.getRecordDelimiters().isEmpty())
            r = r.withDelimiter(IndexerPipelineUtils.extractRecordDelimiters(options.getRecordDelimiters()));
        DoFn<String, InputContent> tr = (options.getReadAsCSV()) ? new ParseCSVFile() : new ParseRawInput();
        readContent = pipeline
            .apply("Read from GCS files", r)
            .apply("Parse input into InputContent objects", ParDo.of(tr));
    }

    // PHASE: Filter already processed URLs
    // If we are supposed to truncate destination tables before write, then don't do any
    // extra filtering based on what is in the destination tables.
    // Otherwise, obtain a cache of already processed URLs and remove from indexing set
    // the items that are already in the destination Bigquery tables
    PCollection<InputContent> contentToProcess = options.getWriteTruncate() ?
readContent: filterAlreadyProcessedUrls(readContent, pipeline, options); // PHASE: Filter by a special Skip flag in the input records // Split the remaining items into items to index and items just to create as webresource // based on skipIndexing flag ContentToIndexOrNot contentPerSkipFlag = filterBasedOnSkipFlag(contentToProcess); PCollection<InputContent> contentToIndexNotSkipped = contentPerSkipFlag.contentToIndex; PCollection<InputContent> contentNotToIndexSkipped = contentPerSkipFlag.contentNotToIndex; // Define the accumulators of all filters PCollection<InputContent> contentToIndex = null; PCollection<InputContent> contentNotToIndex = null; // PHASE: If we were instructed to de-duplicate based on exact wording of documents, // split items in main flow based on whether they are dupes or not if (options.getDedupeText()) { ContentToIndexOrNot content = filterAlreadyProcessedDocuments( contentToIndexNotSkipped, contentNotToIndexSkipped, pipeline, options); contentToIndex = content.contentToIndex; contentNotToIndex = content.contentNotToIndex; } else { contentToIndex = contentToIndexNotSkipped; contentNotToIndex = contentNotToIndexSkipped; } // Process content that does not need to be indexed and just needs to be stored as a webresource PCollection<TableRow> webresourceRowsUnindexed = contentNotToIndex .apply(ParDo.of(new CreateWebresourceTableRowFromInputContentFn())); // PHASE: Index documents (extract opinions and entities/tags). // Return successfully indexed docs, and create a Bigtable write transform to store errors // in Dead Letter table. PCollection<ContentIndexSummary> indexes = indexDocuments(options, contentToIndex); PCollection<ContentIndexSummary> filteredIndexes = null; PCollection<TableRow> webresourceDeduped = null; // PHASE: Filter "soft" duplicates // After Indexing, do another grouping by Title, Round(Length/1000), and Tags // This grouping needs to happen after the "indexing" operation because we will be using Tags identified // by indexing as one of the grouping elements. // This type of grouping and filtering will catch small variations in text (e.g. in copyright notices, bylines, etc) if (options.getDedupeText()) { ContentDuplicateOrNot contentDuplicateOrNot = filterSoftDuplicates(indexes); filteredIndexes = contentDuplicateOrNot.uniqueIndexes; webresourceDeduped = contentDuplicateOrNot.duplicateWebresources; } else { filteredIndexes = indexes; } //PHASE: Enrich with CloudNLP entities if (options.getRatioEnrichWithCNLP() > 0) filteredIndexes = enrichWithCNLP(filteredIndexes, options.getRatioEnrichWithCNLP()); // PHASE: Write to BigQuery // For the Indexes that are unique ("filteredIndexes"), create records in webresource, document, and sentiment. 
// Then, merge resulting webresources with webresourceRowsUnindexed and webresourceDeduped PCollectionTuple bqrows= filteredIndexes .apply(ParDo.of(new CreateTableRowsFromIndexSummaryFn()) .withOutputTags(PipelineTags.webresourceTag, // main output collection TupleTagList.of(PipelineTags.documentTag).and(PipelineTags.sentimentTag))); // 2 side output collections writeAllTablesToBigQuery(bqrows, webresourceRowsUnindexed, webresourceDeduped, options); return pipeline; } /** * @param filteredIndexes * @return */ private static PCollection<ContentIndexSummary> enrichWithCNLP( PCollection<ContentIndexSummary> filteredIndexes, Float ratio) { PCollectionTuple splitAB = filteredIndexes .apply(ParDo.of(new SplitAB(ratio)) .withOutputTags(PipelineTags.BranchA, TupleTagList.of(PipelineTags.BranchB))); PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA); PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB); PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply( ParDo.of(new EnrichWithCNLPEntities())); //Merge all collections with WebResource table records PCollectionList<ContentIndexSummary> contentIndexSummariesList = PCollectionList.of(branchACol).and(enrichedBCol); PCollection<ContentIndexSummary> allIndexSummaries = contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections()); filteredIndexes = allIndexSummaries; return filteredIndexes; } /** * @param options * @param contentToIndex * @return */ private static PCollection<ContentIndexSummary> indexDocuments( IndexerPipelineOptions options, PCollection<InputContent> contentToIndex) { PCollectionTuple alldocuments = contentToIndex .apply(ParDo.of(new IndexDocument()) .withOutputTags(PipelineTags.successfullyIndexed, // main output TupleTagList.of(PipelineTags.unsuccessfullyIndexed))); // side output PCollection<ContentIndexSummary> indexes = alldocuments .get(PipelineTags.successfullyIndexed) .setCoder(AvroCoder.of(ContentIndexSummary.class)); // if the Bigtable admin DB is set, write into dead letter table if (options.getBigtableIndexerAdminDB() != null) { PCollection<InputContent> unprocessedDocuments = alldocuments .get(PipelineTags.unsuccessfullyIndexed); BigtableOptions.Builder optionsBuilder = new BigtableOptions.Builder() .setProjectId(options.getProject()) .setInstanceId(options.getBigtableIndexerAdminDB()); BigtableOptions bigtableOptions = optionsBuilder.build(); unprocessedDocuments .apply(ParDo.of(new CreateDeadLetterEntries())) .apply("Write to Dead Letter table in Bigtable", BigtableIO.write() .withBigtableOptions(bigtableOptions) .withTableId(IndexerPipelineUtils.DEAD_LETTER_TABLE)); } return indexes; } /** * @param contentToProcess * @return */ private static ContentToIndexOrNot filterBasedOnSkipFlag(PCollection<InputContent> contentToProcess) { PCollectionTuple indexOrNotBasedOnSkipFlag = contentToProcess .apply("Filter items to index based on skipIndexing flag", ParDo.of(new FilterItemsToIndex()) .withOutputTags(PipelineTags.contentToIndexNotSkippedTag, // main output collection TupleTagList.of(PipelineTags.contentNotToIndexSkippedTag))); // side output collection ContentToIndexOrNot contentPerSkipFlag = new ContentToIndexOrNot( indexOrNotBasedOnSkipFlag.get(PipelineTags.contentToIndexNotSkippedTag), indexOrNotBasedOnSkipFlag.get(PipelineTags.contentNotToIndexSkippedTag)); return contentPerSkipFlag; } /** * @param contentToIndexNotSkipped * @param contentNotToIndexSkipped * @param pipeline * @param options * @return */ private static 
ContentToIndexOrNot filterAlreadyProcessedDocuments( PCollection<InputContent> contentToIndexNotSkipped, PCollection<InputContent> contentNotToIndexSkipped, Pipeline pipeline, IndexerPipelineOptions options) { PCollection<KV<String,Long>> alreadyProcessedDocs = null; if (!options.getWriteTruncate()) { String query = IndexerPipelineUtils.buildBigQueryProcessedDocsQuery(options); alreadyProcessedDocs = pipeline .apply("Get already processed Documents",BigQueryIO.read().fromQuery(query)) .apply(ParDo.of(new GetDocumentHashFn())); } else { Map<String, Long> map = new HashMap<String,Long>(); alreadyProcessedDocs = pipeline .apply("Create empty side input of Docs", Create.of(map).withCoder(KvCoder.of(StringUtf8Coder.of(),VarLongCoder.of()))); } final PCollectionView<Map<String,Long>> alreadyProcessedDocsSideInput = alreadyProcessedDocs.apply(View.<String,Long>asMap()); PCollectionTuple indexOrNotBasedOnExactDupes = contentToIndexNotSkipped .apply("Extract DocumentHash key", ParDo.of(new GetInputContentDocumentHashFn())) .apply("Group by DocumentHash key", GroupByKey.<String, InputContent>create()) .apply("Eliminate InputContent Dupes", ParDo.of(new EliminateInputContentDupes(alreadyProcessedDocsSideInput)) .withSideInputs(alreadyProcessedDocsSideInput) .withOutputTags(PipelineTags.contentToIndexNotExactDupesTag, // main output collection TupleTagList.of(PipelineTags.contentNotToIndexExactDupesTag))); // side output collection PCollection<InputContent> contentToIndexNotExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentToIndexNotExactDupesTag); PCollection<InputContent> contentNotToIndexExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentNotToIndexExactDupesTag); // Merge the sets of items that are dupes or skipped PCollectionList<InputContent> contentNotToIndexList = PCollectionList.of(contentNotToIndexExactDupes).and(contentNotToIndexSkipped); ContentToIndexOrNot content = new ContentToIndexOrNot(contentToIndexNotExactDupes, contentNotToIndexList.apply(Flatten.<InputContent>pCollections())); return content; } /** * @param options * @param pipeline * @param readContent * @return */ private static PCollection<InputContent> filterAlreadyProcessedUrls( PCollection<InputContent> readContent, Pipeline pipeline, IndexerPipelineOptions options) { PCollection<InputContent> contentToProcess; String query = IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options); PCollection<KV<String,Long>> alreadyProcessedUrls = pipeline .apply("Get processed URLs",BigQueryIO.read().fromQuery(query)) .apply(ParDo.of(new GetUrlFn())); final PCollectionView<Map<String,Long>> alreadyProcessedUrlsSideInput = alreadyProcessedUrls.apply(View.<String,Long>asMap()); contentToProcess = readContent .apply(ParDo.of(new FilterProcessedUrls(alreadyProcessedUrlsSideInput)) .withSideInputs(alreadyProcessedUrlsSideInput)); return contentToProcess; } /** * @param Document indexes * @return a POJO containing 2 PCollections: Unique docs, and Duplicates */ private static ContentDuplicateOrNot filterSoftDuplicates( PCollection<ContentIndexSummary> indexes) { // PCollectionTuple dedupeOrNot = indexes .apply("Extract Text grouping key", ParDo.of(new GetContentIndexSummaryKeyFn())) .apply("Group by Text grouping key", GroupByKey.<ContentSoftDeduplicationKey, ContentIndexSummary>create()) .apply("Eliminate Text dupes", ParDo.of(new EliminateTextDupes()) .withOutputTags(PipelineTags.indexedContentNotToDedupeTag, TupleTagList.of(PipelineTags.indexedContentToDedupeTag))); PCollection<TableRow> dedupedWebresources = 
dedupeOrNot.get(PipelineTags.indexedContentToDedupeTag) .apply(ParDo.of(new CreateWebresourceTableRowFromDupeIndexSummaryFn())); ContentDuplicateOrNot contentDuplicateOrNot = new ContentDuplicateOrNot( dedupeOrNot.get(PipelineTags.indexedContentNotToDedupeTag), dedupedWebresources); return contentDuplicateOrNot; } /** * @param options * @return PTransform that reads from a JDBC source */ private static org.apache.beam.sdk.io.jdbc.JdbcIO.Read<InputContent> readDBRows(IndexerPipelineOptions options) { String query = IndexerPipelineUtils.buildJdbcSourceImportQuery(options); return JdbcIO.<InputContent>read() .withDataSourceConfiguration( JdbcIO.DataSourceConfiguration.create(options.getJdbcDriverClassName(), options.getJdbcSourceUrl()) .withUsername(options.getJdbcSourceUsername()) .withPassword(options.getJdbcSourcePassword()) ) .withQuery(query) .withRowMapper(new RowMapper<InputContent>() { @Override public InputContent mapRow(ResultSet resultSet) throws Exception { InputContent result = new InputContent( resultSet.getString("url"), resultSet.getLong("pub_time")*1000L, resultSet.getString("title"), resultSet.getString("author"), resultSet.getString("language"), resultSet.getString("page_text"), resultSet.getString("doc_col_id"), resultSet.getString("col_item_id"), resultSet.getInt("skip_indexing") ); return result; } }) .withCoder(AvroCoder.of(InputContent.class)); } /** * Join two collections, using post id as the join key Sample: * https://github.com/GoogleCloudPlatform/DataflowJavaSDK-examples/blob/master/src/main/java/com/google/cloud/dataflow/examples/cookbook/JoinExamples.java */ private static PCollection<InputContent> joinRedditPostsAndComments(Pipeline pipeline, IndexerPipelineOptions options) throws Exception { PCollection<TableRow> posts = null; PCollection<TableRow> comments = null; if (options.getRedditPostsQuery() != null) posts = pipeline.apply(BigQueryIO.read().fromQuery(options.getRedditPostsQuery())); else if (options.getRedditPostsTableName() != null) posts = pipeline.apply(BigQueryIO.read().from(options.getRedditPostsTableName())); if (options.getRedditCommentsQuery() != null) comments = pipeline.apply(BigQueryIO.read().fromQuery(options.getRedditCommentsQuery())); else if (options.getRedditCommentsTableName() != null) comments = pipeline.apply(BigQueryIO.read().from(options.getRedditCommentsTableName())); final TupleTag<TableRow> postInfoTag = new TupleTag<TableRow>(); final TupleTag<TableRow> commentInfoTag = new TupleTag<TableRow>(); // transform both input collections to tuple collections, where the keys // are the post-id PCollection<KV<String, TableRow>> postInfo = posts.apply(ParDo.of(new ExtractPostDataFn())); PCollection<KV<String, TableRow>> commentInfo = comments.apply(ParDo.of(new ExtractCommentInfoFn())); PCollection<KV<String, CoGbkResult>> kvpCollection = KeyedPCollectionTuple .of(postInfoTag, postInfo) .and(commentInfoTag, commentInfo) .apply(CoGroupByKey.<String>create()); // Process the CoGbkResult elements generated by the CoGroupByKey // transform. PCollection<InputContent> finalResultCollection = kvpCollection.apply( "Create InputContent from Posts and Comments", ParDo.of(new DoFn<KV<String, CoGbkResult>, InputContent>() { @ProcessElement public void processElement(ProcessContext c) { KV<String, CoGbkResult> e = c.element(); // The CoGbkResult element contains all the data // associated with each unique key // from any of the input collections. 
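// A rough sketch of how each grouped element is consumed below (all names are defined above;
// getOnly() returns the supplied default when no value exists for the given tag):
//   e.getKey()                               -> the post id used as the join key
//   e.getValue().getOnly(postInfoTag, null)  -> the single post TableRow, or null if absent
//   e.getValue().getAll(commentInfoTag)      -> an Iterable of comment TableRows (possibly empty)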
String postId = e.getKey(); // While we are expecting exactly one post record per // key, do some error handling here. TableRow post = e.getValue().getOnly(postInfoTag, null); if (post == null) return; // create a list that will hold all InputContent records ArrayList<InputContent> postAndCommentList = new ArrayList<InputContent>(); String postPermalink = post.get("permalink").toString(); Long postPubTime = IndexerPipelineUtils.extractRedditTime(post.get("created_utc").toString()); /* * sso 11/20/2017: Create two webresource records per post record * The first WR record will have the external post URL, * the second one will have the reddit post URL */ String postUrl = IndexerPipelineUtils.buildRedditPostUrl(postPermalink); String[] postMetaFields = IndexerPipelineUtils.extractRedditPostMetaFields(post); // Create the first InputContent for the post item itself InputContent icPost = new InputContent(/* url */ postUrl, /* pubTime */ postPubTime, /* title */ post.get("title").toString(), /* author */ post.get("author").toString(), /* language */ null, /* text */ post.get("selftext").toString(), /* documentCollectionId */ IndexerPipelineUtils.DOC_COL_ID_REDDIT_FH_BIGQUERY, /* collectionItemId */ postId, /* skipIndexing */ 0, /* parentUrl */ null, // the post record will become the beginning of the thread /* parentPubTime */ null, /* metaFields */ postMetaFields); postAndCommentList.add(icPost); // Build a map of Url and Publication Time for the post // and each comment HashMap<String, Long> pubTimes = new HashMap<String, Long>(); // seed it with the post, which will be the parent of // some of the comments // in the chain pubTimes.put(postUrl, postPubTime); // Take advantage of the fact that all comments of a // post are local to this code // and build a map of pub times for each comment Iterable<TableRow> commentsOfPost = e.getValue().getAll(commentInfoTag); for (TableRow comment : commentsOfPost) { String commentUrl = IndexerPipelineUtils.buildRedditCommentUrl(postPermalink, comment.get("id").toString()); Long commentPubTime = IndexerPipelineUtils.extractRedditTime(comment.get("created_utc").toString()); String commentId = "t1_" + comment.get("id").toString(); String parentId = comment.get("parent_id").toString(); String parentUrl = (parentId.startsWith("t1_")) ? 
        IndexerPipelineUtils.buildRedditCommentUrl(postPermalink, parentId.substring("t1_".length())) :
        postUrl;
    // the parent URL is derived from parent_id (with its "t1_" prefix stripped) so that it
    // matches the commentUrl key that the parent comment contributes to the pubTimes map,
    // and the post URL is used when the parent is the post itself

    String[] commentMetaFields = IndexerPipelineUtils.extractRedditCommentMetaFields(comment);

    InputContent icComment = new InputContent(
        /* url */ commentUrl,
        /* pubTime */ commentPubTime,
        /* title */ null,
        /* author */ comment.get("author").toString(),
        /* language */ null,
        /* text */ comment.get("body").toString(),
        /* documentCollectionId */ IndexerPipelineUtils.DOC_COL_ID_REDDIT_FH_BIGQUERY,
        /* collectionItemId */ commentId,
        /* skipIndexing */ 0,
        /* parentUrl */ parentUrl,
        /* parentPubTime */ null, // don't set time yet, because we might not have read that record yet
        /* metaFields */ commentMetaFields);

    pubTimes.put(commentUrl, commentPubTime); // save the pub time of the current comment
    postAndCommentList.add(icComment); // add comment to the list
}

// iterate through all posts and comments and populate
// the Parent pub times
for (InputContent ic : postAndCommentList) {
    if (ic.parentUrl != null)
        ic.parentPubTime = pubTimes.get(ic.parentUrl);
    c.output(ic);
}
}
}));

return finalResultCollection;
}

/**
 * @param bqrows tuple of webresource (main output), document, and sentiment rows produced by indexing
 * @param webresourceRowsUnindexed webresource rows created for items that were not indexed
 * @param webresourceDeduped webresource rows created for soft-duplicate items, or null when text dedupe is off
 * @param options pipeline options supplying the destination dataset and the write disposition
 */
private static void writeAllTablesToBigQuery(PCollectionTuple bqrows,
        PCollection<TableRow> webresourceRowsUnindexed,
        PCollection<TableRow> webresourceDeduped,
        IndexerPipelineOptions options) {

    PCollection<TableRow> webresourceRows = bqrows.get(PipelineTags.webresourceTag);
    PCollection<TableRow> documentRows = bqrows.get(PipelineTags.documentTag);
    PCollection<TableRow> sentimentRows = bqrows.get(PipelineTags.sentimentTag);

    // Now write to BigQuery
    WriteDisposition dispo = options.getWriteTruncate() ?
        WriteDisposition.WRITE_TRUNCATE : WriteDisposition.WRITE_APPEND;

    // Merge all collections with WebResource table records
    PCollectionList<TableRow> webresourceRowsList = (webresourceDeduped == null) ?
        PCollectionList.of(webresourceRows).and(webresourceRowsUnindexed) :
        PCollectionList.of(webresourceRows).and(webresourceRowsUnindexed).and(webresourceDeduped);
    PCollection<TableRow> allWebresourceRows = webresourceRowsList.apply(Flatten.<TableRow>pCollections());

    allWebresourceRows = !options.isStreaming() ?
        allWebresourceRows.apply("Reshuffle Webresources", new Reshuffle<TableRow>()) :
        allWebresourceRows;

    allWebresourceRows
        .apply("Write to webresource", BigQueryIO.writeTableRows()
            .to(getWebResourcePartitionedTableRef(options))
            .withSchema(getWebResourceSchema())
            .withCreateDisposition(CreateDisposition.CREATE_NEVER)
            .withWriteDisposition(dispo));

    documentRows = !options.isStreaming() ?
        documentRows.apply("Reshuffle Documents", new Reshuffle<TableRow>()) :
        documentRows;

    documentRows
        .apply("Write to document", BigQueryIO.writeTableRows()
            .to(getDocumentPartitionedTableRef(options))
            .withSchema(getDocumentTableSchema())
            .withCreateDisposition(CreateDisposition.CREATE_NEVER)
            .withWriteDisposition(dispo));

    sentimentRows = !options.isStreaming() ?
        sentimentRows.apply("Reshuffle Sentiments", new Reshuffle<TableRow>()) :
        sentimentRows;

    sentimentRows
        .apply("Write to sentiment", BigQueryIO.writeTableRows()
            .to(getSentimentPartitionedTableRef(options))
            .withSchema(getSentimentSchema())
            .withCreateDisposition(CreateDisposition.CREATE_NEVER)
            .withWriteDisposition(dispo));
}

/**
 * Setup step {A}
 * Helper method that defines the BigQuery schema used for the output.
*/ private static TableSchema getWebResourceSchema() { List<TableFieldSchema> fields = new ArrayList<>(); fields.add(new TableFieldSchema().setName("WebResourceHash").setType("STRING").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("Url").setType("STRING")); fields.add(new TableFieldSchema().setName("PublicationTime").setType("TIMESTAMP").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("PublicationDateId").setType("INTEGER").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("ProcessingTime").setType("TIMESTAMP").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("ProcessingDateId").setType("INTEGER").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("DocumentHash").setType("STRING")); fields.add(new TableFieldSchema().setName("DocumentCollectionId").setType("STRING")); fields.add(new TableFieldSchema().setName("CollectionItemId").setType("STRING")); fields.add(new TableFieldSchema().setName("Title").setType("STRING")); fields.add(new TableFieldSchema().setName("Domain").setType("STRING")); fields.add(new TableFieldSchema().setName("Author").setType("STRING")); fields.add(new TableFieldSchema().setName("ParentWebResourceHash").setType("STRING")); fields.add(new TableFieldSchema().setName("MetaFields").setType("STRING").setMode("REPEATED")); TableSchema schema = new TableSchema().setFields(fields); return schema; } /** * Setup step {A} * Helper method that defines the BigQuery schema used for the output. */ private static TableSchema getDocumentTableSchema() { List<TableFieldSchema> fields = new ArrayList<>(); fields.add(new TableFieldSchema().setName("DocumentHash").setType("STRING").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("PublicationTime").setType("TIMESTAMP").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("PublicationDateId").setType("INTEGER").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("ProcessingTime").setType("TIMESTAMP").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("ProcessingDateId").setType("INTEGER").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("DocumentCollectionId").setType("STRING")); fields.add(new TableFieldSchema().setName("CollectionItemId").setType("STRING")); fields.add(new TableFieldSchema().setName("Title").setType("STRING")); fields.add(new TableFieldSchema().setName("Type").setType("INTEGER")); fields.add(new TableFieldSchema().setName("Language").setType("STRING")); fields.add(new TableFieldSchema().setName("ParseDepth").setType("INTEGER")); fields.add(new TableFieldSchema().setName("ContentLength").setType("INTEGER")); fields.add(new TableFieldSchema().setName("Author").setType("STRING")); fields.add(new TableFieldSchema().setName("Text").setType("STRING")); fields.add(new TableFieldSchema().setName("MainWebResourceHash").setType("STRING")); fields.add(new TableFieldSchema().setName("ParentWebResourceHash").setType("STRING")); List<TableFieldSchema> tagsFields = new ArrayList<>(); tagsFields.add(new TableFieldSchema().setName("Tag").setType("STRING")); tagsFields.add(new TableFieldSchema().setName("Weight").setType("FLOAT")); tagsFields.add(new TableFieldSchema().setName("GoodAsTopic").setType("BOOLEAN")); fields.add(new TableFieldSchema().setName("Tags").setType("RECORD").setFields(tagsFields).setMode("REPEATED")); TableSchema schema = new TableSchema().setFields(fields); return schema; } /** * Setup step {A} * Helper method that defines the BigQuery schema used for the output. 
*/ private static TableSchema getSentimentSchema() { List<TableFieldSchema> fields = new ArrayList<>(); fields.add(new TableFieldSchema().setName("SentimentHash").setType("STRING").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("DocumentHash").setType("STRING").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("DocumentTime").setType("TIMESTAMP").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("DocumentDateId").setType("INTEGER").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("Text").setType("STRING").setMode("REQUIRED")); fields.add(new TableFieldSchema().setName("LabelledPositions").setType("STRING")); fields.add(new TableFieldSchema().setName("AnnotatedText").setType("STRING")); fields.add(new TableFieldSchema().setName("AnnotatedHtml").setType("STRING")); fields.add(new TableFieldSchema().setName("SentimentTotalScore").setType("INTEGER")); fields.add(new TableFieldSchema().setName("DominantValence").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StAcceptance").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StAnger").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StAnticipation").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StAmbiguous").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StDisgust").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StFear").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StGuilt").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StInterest").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StJoy").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StSadness").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StShame").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StSurprise").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StPositive").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StNegative").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StSentiment").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StProfane").setType("INTEGER")); fields.add(new TableFieldSchema().setName("StUnsafe").setType("INTEGER")); fields.add(new TableFieldSchema().setName("MainWebResourceHash").setType("STRING")); fields.add(new TableFieldSchema().setName("ParentWebResourceHash").setType("STRING")); List<TableFieldSchema> tagsFields = new ArrayList<>(); tagsFields.add(new TableFieldSchema().setName("Tag").setType("STRING")); tagsFields.add(new TableFieldSchema().setName("GoodAsTopic").setType("BOOLEAN")); fields.add(new TableFieldSchema().setName("Tags").setType("RECORD").setFields(tagsFields).setMode("REPEATED")); fields.add(new TableFieldSchema().setName("Signals").setType("STRING").setMode("REPEATED")); TableSchema schema = new TableSchema().setFields(fields); return schema; } static class ContentToIndexOrNot { public final PCollection<InputContent> contentToIndex; public final PCollection<InputContent> contentNotToIndex; public ContentToIndexOrNot(PCollection<InputContent> contentToIndex, PCollection<InputContent> contentNotToIndex){ this.contentToIndex = contentToIndex; this.contentNotToIndex = contentNotToIndex; } } static class ContentDuplicateOrNot { public final PCollection<ContentIndexSummary> uniqueIndexes; public final PCollection<TableRow> duplicateWebresources; public ContentDuplicateOrNot(PCollection<ContentIndexSummary> uniqueIndexes, 
PCollection<TableRow> duplicateWebresources){ this.uniqueIndexes = uniqueIndexes; this.duplicateWebresources = duplicateWebresources; } } static class LogPipelineOptions extends DoFn<Integer, Void> { @ProcessElement public void processElement(ProcessContext c) { LOG.info("LogPipelineOptions: " + c.getPipelineOptions().toString()); } } /** * Create items to be stored in Bigtable dead letter table unprocessed-documents * @author sezok * */ static class CreateDeadLetterEntries extends DoFn<InputContent, KV<ByteString, Iterable<Mutation>>> { @ProcessElement public void processElement(ProcessContext c) { InputContent i = c.element(); String jobName = c.getPipelineOptions().getJobName(); ByteString rowkey = ByteString.copyFromUtf8(jobName + "#" + i.expectedDocumentHash); ByteString value = ByteString.copyFromUtf8(i.text); Iterable<Mutation> mutations = ImmutableList.of(Mutation.newBuilder() .setSetCell( Mutation.SetCell.newBuilder() .setFamilyName(IndexerPipelineUtils.DEAD_LETTER_TABLE_ERR_CF) .setColumnQualifier(ByteString.copyFromUtf8("text")) .setValue(value) ) .build()); c.output(KV.of(rowkey, mutations)); } } /** * Check in the map if we already processed this Url, and if we haven't, * add the input content to the list that needs to be processed * @author sezok * */ static class FilterProcessedUrls extends DoFn<InputContent, InputContent> { final PCollectionView<Map<String,Long>> alreadyProcessedUrlsSideInput; public FilterProcessedUrls(PCollectionView<Map<String,Long>> si) { this.alreadyProcessedUrlsSideInput = si; } @ProcessElement public void processElement(ProcessContext c) { InputContent i = c.element(); Long proTime = c.sideInput(this.alreadyProcessedUrlsSideInput).get(i.url); if (proTime == null) c.output(i); } } static class EliminateInputContentDupes extends DoFn<KV<String,Iterable<InputContent>>, InputContent> { final PCollectionView<Map<String,Long>> alreadyProcessedDocsSideInput; public EliminateInputContentDupes(PCollectionView<Map<String,Long>> si) { this.alreadyProcessedDocsSideInput = si; } @ProcessElement public void processElement(ProcessContext c) { KV<String, Iterable<InputContent>> kv = c.element(); String documentHash = kv.getKey(); Iterable<InputContent> dupes = kv.getValue(); boolean isFirst = true; int groupSize = Iterables.size(dupes); for (InputContent ic : dupes) { // Check if this doc was already processed and stored in BQ Map<String,Long> sideInputMap = c.sideInput(alreadyProcessedDocsSideInput); Long proTime = sideInputMap.get(ic.expectedDocumentHash); if (proTime!=null) { c.output(PipelineTags.contentNotToIndexExactDupesTag,ic); continue; } if (isFirst) { isFirst = false; c.output(ic); } else { c.output(PipelineTags.contentNotToIndexExactDupesTag,ic); } } } } /** * EliminateTextDupes - a ParDo that takes a group of text documents and selects one that * will represent all of them */ static class EliminateTextDupes extends DoFn<KV<ContentSoftDeduplicationKey,Iterable<ContentIndexSummary>>, ContentIndexSummary> { @ProcessElement public void processElement(ProcessContext c) { KV<ContentSoftDeduplicationKey, Iterable<ContentIndexSummary>> kv = c.element(); ContentSoftDeduplicationKey key = kv.getKey(); Iterable<ContentIndexSummary> group = kv.getValue(); // Calculate stats for Tags and determine the shortest text HashMap<String,Integer> tagStats = new HashMap<String,Integer>(); Integer minLength = Integer.MAX_VALUE; Integer groupSize = Iterables.size(group); // for single element document groups stop the checks right here if (groupSize == 1) { 
c.output(group.iterator().next()); return; } for (ContentIndexSummary is : group) { // build tag stats for (DocumentTag dt : is.doc.tags){ Integer i = tagStats.get(dt.tag); if (i == null) tagStats.put(dt.tag, 1); else tagStats.put(dt.tag, i + 1); } } // Iterate through the group again, this time checking for passing criteria by Tags // For a tag to count as a good match, it needs to occur in half the documents of the group Integer minTagOccurences = Math.max(Math.round(groupSize/2),2); ContentIndexSummary shortestMatch = null; ArrayList<ContentIndexSummary> indexesToRemap = new ArrayList<ContentIndexSummary>(); for (ContentIndexSummary is : group) { Integer matchedTags = 0; Integer totalTags = is.doc.tags.length; for (DocumentTag dt : is.doc.tags){ Integer tagOcc = tagStats.get(dt.tag); if (tagOcc >= minTagOccurences) matchedTags++; } Float matchedRatio = ((float) matchedTags / (float) totalTags); if (matchedRatio >= 0.5 && matchedTags >= 2) { // Documents that have an acceptable match on tags should be remapped to a single document // The shortest doc becomes the winning document, and all other docs will be // remapped to this document if (is.doc.contentLength < minLength) { if (shortestMatch != null) indexesToRemap.add(shortestMatch); // push the previous shortie to the list of remaps shortestMatch = is; minLength = is.doc.contentLength; } else { // For now add the current doc to a list, because we might not have hit the // shortest document yet indexesToRemap.add(is); } } else { // this doc does not pass the deduplication criteria, so release it into the main output c.output(is); } } if (shortestMatch!=null) { c.output(shortestMatch); // the shortest match goes to main output String shortestMatchHash = shortestMatch.doc.documentHash; for (ContentIndexSummary is : indexesToRemap) { KV<String,ContentIndexSummary> kvRemap = KV.of(shortestMatchHash,is); c.output(PipelineTags.indexedContentToDedupeTag,kvRemap); } } } } /** * * IndexDocument - a ParDo that analyzes just one document at a time * and produces its Sentiment Analysis summary */ static class IndexDocument extends DoFn<InputContent, ContentIndexSummary> { @ProcessElement public void processElement(ProcessContext c) { ContentIndex contentindex = null; ContentIndexSummary summary = null; InputContent ic = null; IndexerPipelineOptions options = c.getPipelineOptions().as(IndexerPipelineOptions.class); IndexingConsts.ContentType contentType = options.getIndexAsShorttext() ? IndexingConsts.ContentType.SHORTTEXT: IndexingConsts.ContentType.ARTICLE; try { long processingTime = System.currentTimeMillis(); ic = c.element(); contentindex = new ContentIndex( ic.text, IndexingConsts.IndexingType.TOPSENTIMENTS, contentType, processingTime, ic.url, ic.pubTime, ic.title, ic.author, ic.documentCollectionId, ic.collectionItemId, ic.parentUrl, ic.parentPubTime, ic.metaFields); Indexer.index(contentindex); // Call to the NLP package if (!contentindex.IsIndexingSuccessful) throw new Exception(contentindex.IndexingErrors + ". 
Text: "+ic.text); summary = contentindex.getContentIndexSummary(); long indexingDuration = System.currentTimeMillis() - processingTime; if (indexingDuration > IndexerPipeline.REPORT_LONG_INDEXING_DURATION) { LOG.warn("IndexDocument.processElement: Indexing took " + indexingDuration + " milliseconds."); StringBuilder sb = new StringBuilder(); LogUtils.printIndex(1, contentindex, sb); String docIndex = sb.toString(); LOG.warn("IndexDocument.processElement: Contents of Index ["+indexingDuration+" ms]: " + docIndex); } if (summary == null) throw new Exception("null ContentIndexSummary returned"); else c.output(summary); } catch (Exception e) { LOG.warn("IndexDocument.processElement:",e); c.output(PipelineTags.unsuccessfullyIndexed, ic); } } } /** * * ProcessRawInput - a DoFn that extracts attributes like URL, Title, Author from raw text * and puts them into InputContent */ static class ParseRawInput extends DoFn<String,InputContent> { @ProcessElement public void processElement(ProcessContext c) { String rawInput = null; InputContent iContent = null; try { rawInput = c.element(); if (rawInput == null) throw new Exception("ProcessRawInput: null raw content"); rawInput = rawInput.trim(); if (rawInput.isEmpty()) throw new Exception("ProcessRawInput: empty raw content or whitespace chars only"); iContent = InputContent.createInputContent(rawInput); } catch (Exception e) { LOG.warn(e.getMessage()); } if (iContent != null) c.output(iContent); } } public static class ParseCSVFile extends DoFn<String,InputContent> { @ProcessElement public void processElement(ProcessContext c) { String rawInput = null; InputContent iContent = null; try { rawInput = c.element(); if (rawInput == null) throw new Exception("ParseCSVFile: null raw content"); FileIndexerPipelineOptions options = c.getPipelineOptions().as(FileIndexerPipelineOptions.class); Integer textColumnIdx = options.getTextColumnIdx(); Integer collectionItemIdIdx = options.getCollectionItemIdIdx(); InputStreamReader isr = new InputStreamReader(IOUtils.toInputStream(rawInput,StandardCharsets.UTF_8.name())); Iterable<CSVRecord> records = CSVFormat.DEFAULT .withFirstRecordAsHeader() .parse(isr); for (CSVRecord record : records) { String text = record.get(textColumnIdx); String documentCollectionId = IndexerPipelineUtils.DOC_COL_ID_CSV_FILE; String collectionItemId = (collectionItemIdIdx!=null)? 
                record.get(collectionItemIdIdx) : null;

            InputContent ic = new InputContent(
                null /*url*/,
                null /*pubTime*/,
                null /*title*/,
                null /*author*/,
                null /*language*/,
                text,
                documentCollectionId,
                collectionItemId,
                0 /*skipIndexing*/);

            c.output(ic);
        }
    } catch (Exception e) {
        LOG.warn(e.getMessage());
    }
}
}

/**
 *
 * Use in the future, when we are able to parallelize import at the record file source
 * @author sezok
 *
 */
static class ParseCSVLine extends DoFn<String, InputContent> {

    /*
    @Setup
    public void setup() {
    }

    @Teardown
    public void teardown() {
    }
    */

    @ProcessElement
    public void processElement(ProcessContext c) {

        String rawInput = null;
        InputContent iContent = null;

        try {
            rawInput = c.element();
            if (rawInput == null)
                throw new Exception("ParseCSVLine: null raw content");
            rawInput = rawInput.trim();
            if (rawInput.isEmpty())
                throw new Exception("ParseCSVLine: empty raw content or whitespace chars only");

            FileIndexerPipelineOptions options = c.getPipelineOptions().as(FileIndexerPipelineOptions.class);
            Integer textColumnIdx = options.getTextColumnIdx();
            Integer collectionItemIdIdx = options.getCollectionItemIdIdx();

            InputStreamReader isr = new InputStreamReader(IOUtils.toInputStream(rawInput, StandardCharsets.UTF_8.name()));
            Iterable<CSVRecord> records = CSVFormat.DEFAULT.parse(isr);

            for (CSVRecord record : records) { // should only be one record, but handle multi-record case as well
                String text = record.get(textColumnIdx);
                String documentCollectionId = IndexerPipelineUtils.DOC_COL_ID_CSV_FILE;
                String collectionItemId = record.get(collectionItemIdIdx);

                InputContent ic = new InputContent(
                    null /*url*/,
                    null /*pubTime*/,
                    null /*title*/,
                    null /*author*/,
                    null /*language*/,
                    text,
                    documentCollectionId,
                    collectionItemId,
                    0 /*skipIndexing*/);

                c.output(ic);
            }
        } catch (Exception e) {
            LOG.warn(e.getMessage());
        }
    }
}

/**
 *
 * ParseGDELTJsonInput - a DoFn that extracts attributes like URL, Title, Author from JSON
 * in GDELT format and puts them into InputContent
 */
static class ParseGDELTJsonInput extends DoFn<String, InputContent> {

    @ProcessElement
    public void processElement(ProcessContext c) {

        String rawInput = null;
        InputContent iContent = null;

        try {
            rawInput = c.element();
            if (rawInput == null || rawInput.isEmpty())
                throw new Exception("ParseGDELTJsonInput: null or empty raw content");

            iContent = InputContent.createInputContentFromGDELTJson(rawInput);

            // Skip non-English content for now
            if (!iContent.language.equals("EN"))
                iContent = null;

        } catch (Exception e) {
            LOG.warn(e.getMessage());
        }

        if (iContent != null)
            c.output(iContent);
    }
}

/**
 * CreateTableRowsFromIndexSummaryFn - a DoFn for converting a ContentIndexSummary into
 * BigQuery records: a webresource row (main output) plus document and sentiment rows
 * (side outputs tagged with documentTag and sentimentTag)
 */
static class CreateTableRowsFromIndexSummaryFn extends DoFn<ContentIndexSummary, TableRow> {

    @ProcessElement
    public void processElement(ProcessContext c) {

        ContentIndexSummary summary = c.element();

        // Create the webresource entry
        Instant pubTime = new Instant(summary.wr.publicationTime);
        Instant proTime = new Instant(summary.wr.processingTime);

        TableRow wrrow = new TableRow()
            .set("WebResourceHash", summary.wr.webResourceHash)
            .set("PublicationTime", pubTime.toString())
            .set("PublicationDateId", summary.wr.publicationDateId)
            .set("ProcessingTime", proTime.toString())
            .set("ProcessingDateId", summary.wr.processingDateId)
            .set("DocumentHash", summary.wr.documentHash);

        IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "Url", summary.wr.url);
        IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "DocumentCollectionId", summary.wr.documentCollectionId);
IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "CollectionItemId", summary.wr.collectionItemId); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "Title", summary.wr.title); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "Domain", summary.wr.domain); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "Author", summary.wr.author); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "ParentWebResourceHash", summary.wr.parentWebResourceHash); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "MetaFields", summary.wr.metaFields); c.output(wrrow); // Create the document entry pubTime = new Instant(summary.doc.publicationTime); proTime = new Instant(summary.doc.processingTime); List<TableRow> tags = new ArrayList<>(); if (summary.doc.tags != null) for (int i=0; i < summary.doc.tags.length; i++) { TableRow row = new TableRow(); row.set("Tag",summary.doc.tags[i].tag); row.set("Weight",summary.doc.tags[i].weight); IndexerPipelineUtils.setTableRowFieldIfNotNull(row,"GoodAsTopic",summary.doc.tags[i].goodAsTopic); tags.add(row); } TableRow drow = new TableRow() .set("DocumentHash", summary.doc.documentHash) .set("PublicationTime", pubTime.toString()) .set("PublicationDateId", summary.doc.publicationDateId) .set("ProcessingTime", proTime.toString()) .set("ProcessingDateId", summary.doc.processingDateId); IndexerPipelineUtils.setTableRowFieldIfNotNull(drow,"DocumentCollectionId", summary.doc.documentCollectionId); IndexerPipelineUtils.setTableRowFieldIfNotNull(drow,"CollectionItemId", summary.doc.collectionItemId); IndexerPipelineUtils.setTableRowFieldIfNotNull(drow,"Title", summary.doc.title); IndexerPipelineUtils.setTableRowFieldIfNotNull(drow,"Type", summary.doc.type.ordinal()); IndexerPipelineUtils.setTableRowFieldIfNotNull(drow,"Language", summary.doc.language); IndexerPipelineUtils.setTableRowFieldIfNotNull(drow,"ParseDepth", summary.doc.contentParseDepth.ordinal()); IndexerPipelineUtils.setTableRowFieldIfNotNull(drow,"ContentLength", summary.doc.contentLength); IndexerPipelineUtils.setTableRowFieldIfNotNull(drow,"Author", summary.wr.author); IndexerPipelineUtils.setTableRowFieldIfNotNull(drow,"Text", summary.doc.text); IndexerPipelineUtils.setTableRowFieldIfNotNull(drow,"MainWebResourceHash", summary.doc.mainWebResourceHash); IndexerPipelineUtils.setTableRowFieldIfNotNull(drow,"ParentWebResourceHash", summary.doc.parentWebResourceHash); IndexerPipelineUtils.setTableRowFieldIfNotNull(drow,"Tags", tags); c.output(PipelineTags.documentTag, drow); if (summary.sentiments != null) { for (int i=0; i < summary.sentiments.length; i++) { List<TableRow> sttags = new ArrayList<>(); if (summary.sentiments[i].tags != null) for (int j=0; j < summary.sentiments[i].tags.length; j++) { TableRow strow = new TableRow(); strow.set("Tag",summary.sentiments[i].tags[j].tag); IndexerPipelineUtils.setTableRowFieldIfNotNull(strow,"GoodAsTopic",summary.sentiments[i].tags[j].goodAsTopic); sttags.add(strow); } Instant docTime = new Instant(summary.sentiments[i].documentTime); TableRow strow = new TableRow() .set("SentimentHash", summary.sentiments[i].sentimentHash) .set("DocumentHash", summary.sentiments[i].documentHash) .set("DocumentTime", docTime.toString()) .set("DocumentDateId", summary.sentiments[i].documentDateId) .set("Text", summary.sentiments[i].text) .set("LabelledPositions", summary.sentiments[i].labelledPositions) .set("AnnotatedText", summary.sentiments[i].annotatedText) .set("AnnotatedHtml", summary.sentiments[i].annotatedHtmlText) .set("SentimentTotalScore", 
summary.sentiments[i].sentimentTotalScore) .set("DominantValence", summary.sentiments[i].dominantValence.ordinal()) .set("StAcceptance", summary.sentiments[i].stAcceptance) .set("StAnger", summary.sentiments[i].stAnger) .set("StAnticipation", summary.sentiments[i].stAnticipation) .set("StAmbiguous", summary.sentiments[i].stAmbiguous) .set("StDisgust", summary.sentiments[i].stDisgust) .set("StFear", summary.sentiments[i].stFear) .set("StGuilt", summary.sentiments[i].stGuilt) .set("StInterest", summary.sentiments[i].stInterest) .set("StJoy", summary.sentiments[i].stJoy) .set("StSadness", summary.sentiments[i].stSadness) .set("StShame", summary.sentiments[i].stShame) .set("StSurprise", summary.sentiments[i].stSurprise) .set("StPositive", summary.sentiments[i].stPositive) .set("StNegative", summary.sentiments[i].stNegative) .set("StSentiment", summary.sentiments[i].stSentiment) .set("StProfane", summary.sentiments[i].stProfane) .set("StUnsafe", summary.sentiments[i].stUnsafe); IndexerPipelineUtils.setTableRowFieldIfNotNull(strow,"MainWebResourceHash", summary.sentiments[i].mainWebResourceHash); IndexerPipelineUtils.setTableRowFieldIfNotNull(strow,"ParentWebResourceHash", summary.sentiments[i].parentWebResourceHash); IndexerPipelineUtils.setTableRowFieldIfNotNull(strow,"Tags", sttags); IndexerPipelineUtils.setTableRowFieldIfNotNull(strow,"Signals", summary.sentiments[i].signals); c.output(PipelineTags.sentimentTag, strow); } } } } static class CreateWebresourceTableRowFromDupeIndexSummaryFn extends DoFn<KV<String,ContentIndexSummary>, TableRow> { @ProcessElement public void processElement(ProcessContext c) { KV<String,ContentIndexSummary> kv = c.element(); String newDocumentHash = kv.getKey(); ContentIndexSummary summary = kv.getValue(); // Create the webresource entry Instant pubTime = new Instant(summary.wr.publicationTime); Instant proTime = new Instant(summary.wr.processingTime); // TODO: we are leaving summary.wr.collectionItemId, summary.wr.publicationDateId unchanged for now // These values are different from the Document to which we repointed the WebResource // It could be a good or a bad thing, depending on the circumstances TableRow wrrow = new TableRow() .set("WebResourceHash", summary.wr.webResourceHash) .set("Url", summary.wr.url) .set("PublicationTime", pubTime.toString()) .set("PublicationDateId", summary.wr.publicationDateId) .set("ProcessingTime", proTime.toString()) .set("ProcessingDateId", summary.wr.processingDateId) .set("DocumentHash", newDocumentHash); // replace the original DocumentHash with the passed value IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "DocumentCollectionId", summary.wr.documentCollectionId); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "CollectionItemId", summary.wr.collectionItemId); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "Title", summary.wr.title); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "Domain", summary.wr.domain); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "Author", summary.wr.author); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "ParentWebResourceHash", summary.wr.parentWebResourceHash); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow, "MetaFields", summary.wr.metaFields); c.output(wrrow); } } static class CreateWebresourceTableRowFromInputContentFn extends DoFn<InputContent, TableRow> { @ProcessElement public void processElement(ProcessContext c) { InputContent ic = c.element(); long processingTime = System.currentTimeMillis(); WebResource wr = new WebResource(); // retrieve 
the Parent Web Resource Hash and Document Hash, if available String parentWebResourceHash = ic.expectedParentWebResourceHash; String documentHash = ic.expectedDocumentHash; wr.initialize(ic.url, ic.pubTime, processingTime, documentHash, ic.documentCollectionId, ic.collectionItemId, ic.title, ic.author, parentWebResourceHash, ic.metaFields); Instant pubTime = new Instant(wr.publicationTime); Instant proTime = new Instant(wr.processingTime); TableRow wrrow = new TableRow() .set("WebResourceHash", wr.webResourceHash) .set("Url", wr.url) .set("PublicationTime", pubTime.toString()) .set("PublicationDateId", wr.publicationDateId) .set("ProcessingTime", proTime.toString()) .set("ProcessingDateId", wr.processingDateId) .set("DocumentHash", wr.documentHash); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow,"DocumentCollectionId", wr.documentCollectionId); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow,"CollectionItemId", wr.collectionItemId); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow,"Title", wr.title); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow,"Domain", wr.domain); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow,"Author", wr.author); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow,"ParentWebResourceHash", wr.parentWebResourceHash); IndexerPipelineUtils.setTableRowFieldIfNotNull(wrrow,"MetaFields", wr.metaFields); c.output(wrrow); } } /** * */ static class GetUrlFn extends DoFn<TableRow, KV<String,Long>> { @ProcessElement public void processElement(ProcessContext c) { TableRow row = c.element(); String url = IndexerPipelineUtils.getTableRowStringFieldIfNotNull(row,"Url"); String processingTime = IndexerPipelineUtils.getTableRowStringFieldIfNotNull(row,"ProcessingTime"); if (url != null && !url.isEmpty()) { Long l = IndexerPipelineUtils.parseDateToLong(IndexerPipelineUtils.dateTimeFormatYMD_HMS_MSTZ, processingTime); if (l == null) l = 1L; KV<String,Long> kv = KV.of(url, l); c.output(kv); } } } static class GetDocumentHashFn extends DoFn<TableRow, KV<String,Long>> { @ProcessElement public void processElement(ProcessContext c) { TableRow row = c.element(); String documentHash = row.get("DocumentHash").toString(); String processingTime = row.get("ProcessingTime").toString(); if (documentHash != null && !documentHash.isEmpty()) { Long l = IndexerPipelineUtils.parseDateToLong(IndexerPipelineUtils.dateTimeFormatYMD_HMS_MSTZ, processingTime); if (l == null) l = 1L; KV<String,Long> kv = KV.of(documentHash, l); c.output(kv); } } } /** * */ static class GetInputContentDocumentHashFn extends DoFn<InputContent, KV<String,InputContent>> { @ProcessElement public void processElement(ProcessContext c) { KV<String,InputContent> kv = KV.of(c.element().expectedDocumentHash, c.element()); c.output(kv); } } /** * Re-implementation of GetContentIndexSummaryKeyFn that produces a composite key * ala https://cloud.google.com/blog/big-data/2017/08/guide-to-common-cloud-dataflow-use-case-patterns-part-2#pattern-groupby-using-multiple-data-properties */ static class GetContentIndexSummaryKeyFn extends DoFn<ContentIndexSummary, KV<ContentSoftDeduplicationKey,ContentIndexSummary>> { @ProcessElement public void processElement(ProcessContext c) { String origTitle = c.element().doc.title; String title = (origTitle == null) || origTitle.isEmpty() ? 
EMPTY_TITLE_KEY_PREFIX : origTitle; Integer contentLengthType = Math.round ( c.element().doc.contentLength / 1000); ContentSoftDeduplicationKey key = new ContentSoftDeduplicationKey(title, contentLengthType); KV<ContentSoftDeduplicationKey,ContentIndexSummary> kv = KV.of(key, c.element()); c.output(kv); } } @DefaultCoder(AvroCoder.class) static class ContentSoftDeduplicationKey { public String title; public Integer contentLengthType; public ContentSoftDeduplicationKey() {} public ContentSoftDeduplicationKey (String title, Integer contentLengthType) { this.title = title; this.contentLengthType = contentLengthType; } } /** * Original implementation of GetContentIndexSummaryKeyFn that produces concatenated string [title + "-" Math.round(contentLength/1000)] */ /* static class GetContentIndexSummaryKeyFn extends DoFn<ContentIndexSummary, KV<String,ContentIndexSummary>> { @ProcessElement public void processElement(ProcessContext c) { String title = c.element().doc.title; Integer contentLength = c.element().doc.contentLength; String key = (((title == null) || title.isEmpty()) ? EMPTY_TITLE_KEY_PREFIX : title) + " + " + Math.round(contentLength/1000); KV<String,ContentIndexSummary> kv = KV.of(key, c.element()); c.output(kv); } } */ /** * */ static class FilterItemsToIndex extends DoFn<InputContent, InputContent> { @ProcessElement public void processElement(ProcessContext c) { InputContent ic = c.element(); if (ic.skipIndexing == 0) c.output(ic); else c.output(PipelineTags.contentNotToIndexSkippedTag, ic); } } private static PartitionedTableRef getWebResourcePartitionedTableRef(IndexerPipelineOptions options) { return PartitionedTableRef.perDay( options.getProject(), options.getBigQueryDataset(), IndexerPipelineUtils.WEBRESOURCE_TABLE, "PublicationDateId", false); } private static PartitionedTableRef getDocumentPartitionedTableRef(IndexerPipelineOptions options) { return PartitionedTableRef.perDay( options.getProject(), options.getBigQueryDataset(), IndexerPipelineUtils.DOCUMENT_TABLE, "PublicationDateId", false); } private static PartitionedTableRef getSentimentPartitionedTableRef(IndexerPipelineOptions options) { return PartitionedTableRef.perDay( options.getProject(), options.getBigQueryDataset(), IndexerPipelineUtils.SENTIMENT_TABLE, "DocumentDateId", false); } /** * Call CloudNLP * */ static class EnrichWithCNLPEntities extends DoFn<ContentIndexSummary, ContentIndexSummary> { private LanguageServiceClient languageClient; @StartBundle public void startBundle(){ try { this.languageClient = LanguageServiceClient.create(); } catch (Exception e) { LOG.warn(e.getMessage()); } } @FinishBundle public void finishBundle(){ if (this.languageClient == null) return; try { this.languageClient.close(); } catch (Exception e) { LOG.warn(e.getMessage()); } } @ProcessElement public void processElement(ProcessContext c) { ContentIndexSummary is = c.element(); try { if (this.languageClient == null) throw new Exception("CNLP client not initialized"); com.google.cloud.language.v1.Document doc = com.google.cloud.language.v1.Document.newBuilder() .setContent(is.doc.text).setType(Type.PLAIN_TEXT).build(); AnalyzeEntitiesRequest request = AnalyzeEntitiesRequest.newBuilder() .setDocument(doc).setEncodingType(EncodingType.UTF16).build(); AnalyzeEntitiesResponse response = languageClient.analyzeEntities(request); // get at most as many entities as we have tags in the Sirocco-based output int entitiesToGet = Math.min(is.doc.tags.length, response.getEntitiesList().size()); DocumentTag[] newTags = new 
DocumentTag[entitiesToGet]; // Create additional Document Tags and add them to the output index summary for (int idx = 0; idx < entitiesToGet; idx++) { // Entities are sorted by salience in the response list, so pick the first ones Entity entity = response.getEntitiesList().get(idx); DocumentTag dt = new DocumentTag(); String tag = IndexerPipelineUtils.CNLP_TAG_PREFIX + entity.getName(); Float weight = entity.getSalience(); Boolean goodAsTopic = null; dt.initialize(tag, weight, goodAsTopic); newTags[idx] = dt; } if (entitiesToGet>0) { ContentIndexSummary iscopy = is.copy(); DocumentTag[] combinedTags = new DocumentTag[newTags.length + iscopy.doc.tags.length]; System.arraycopy(iscopy.doc.tags, 0, combinedTags, 0, iscopy.doc.tags.length); System.arraycopy(newTags, 0, combinedTags, iscopy.doc.tags.length, newTags.length); iscopy.doc.tags = combinedTags; c.output(iscopy); } else c.output(is); } catch (Exception e) { LOG.warn(e.getMessage()); } } } /** * Splits incoming collection into A (main output) and B (side output) * * */ static class SplitAB extends DoFn<ContentIndexSummary, ContentIndexSummary> { /** * bRatio - Ratio of elements to route to "B" side output. * Needs to be a float value between 0 and 1. */ private final Float bRatio; private final int threshold; private transient ThreadLocalRandom random; public SplitAB(Float bRatio) { this.bRatio = (bRatio < 0) ? 0: (bRatio < 1)? bRatio : 1; // valid values are between 0 and 1 this.threshold = (int) (((float) Integer.MAX_VALUE) * this.bRatio); } @StartBundle public void startBundle() { random = ThreadLocalRandom.current(); } @ProcessElement public void processElement(ProcessContext c) { ContentIndexSummary i = c.element(); int dice = random.nextInt(Integer.MAX_VALUE); if (dice > this.threshold) c.output(i); else c.output(PipelineTags.BranchB, i); } } }
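
/*
 * Example invocation (a sketch only, not part of the pipeline): the flag names below are
 * assumed to follow Beam's convention of deriving option names from the IndexerPipelineOptions
 * getters used in this class (e.g. getInputFile -> --inputFile, getBigQueryDataset ->
 * --bigQueryDataset); verify them against the IndexerPipelineOptions interface before running.
 *
 *   mvn compile exec:java \
 *     -Dexec.mainClass=com.google.cloud.dataflow.examples.opinionanalysis.IndexerPipeline \
 *     -Dexec.args="--project=<gcp-project> \
 *       --runner=DataflowRunner \
 *       --inputFile=gs://<bucket>/input/*.txt \
 *       --bigQueryDataset=<dataset> \
 *       --writeTruncate=false \
 *       --dedupeText=true \
 *       --ratioEnrichWithCNLP=0.1"
 */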