java source code of JsonSerializationProcessor

package examples;

/*
 * #%L
 * Wikidata Toolkit Examples
 * %%
 * Copyright (C) 2014 - 2015 Wikidata Toolkit Developers
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
import org.wikidata.wdtk.datamodel.helpers.DatamodelConverter;
import org.wikidata.wdtk.datamodel.helpers.DatamodelFilter;
import org.wikidata.wdtk.datamodel.helpers.JsonSerializer;
import org.wikidata.wdtk.datamodel.implementation.DataObjectFactoryImpl;
import org.wikidata.wdtk.datamodel.interfaces.*;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

/**
 * This example illustrates how to create a JSON serialization of some of the
 * data found in a dump. It uses a {@link DatamodelConverter} with filter
 * settings to eliminate some of the data.
 * <p>
 * As an example, the program only serializes data for people who were born in
 * Dresden, Germany. This can be changed by modifying the code in
 * {@link #includeDocument(ItemDocument)}.
 *
 * @author Markus Kroetzsch
 */
public class JsonSerializationProcessor implements EntityDocumentProcessor {

	static final String OUTPUT_FILE_NAME = "json-serialization-example.json.gz";

	final JsonSerializer jsonSerializer;

	final DatamodelFilter datamodelFilter;

	/**
	 * Runs the example program.
	 *
	 * @throws IOException if there was a problem in writing the output file
	 */
	public static void main(String[] args) throws IOException {
		ExampleHelpers.configureLogging();
		JsonSerializationProcessor.printDocumentation();

		JsonSerializationProcessor jsonSerializationProcessor = new JsonSerializationProcessor();
		ExampleHelpers
				.processEntitiesFromWikidataDump(jsonSerializationProcessor);
		jsonSerializationProcessor.close();
	}

	/**
	 * Constructor. Initializes various helper objects we use for the JSON
	 * serialization, and opens the file that we want to write to.
	 *
	 * @throws IOException if there is a problem opening the output file
	 */
	public JsonSerializationProcessor() throws IOException {
		// The filter is used to copy selected parts of the data. We use this
		// to remove some parts from the documents we serialize.
		DocumentDataFilter filter = new DocumentDataFilter();

		// Only copy English labels, descriptions, and aliases:
		filter.setLanguageFilter(Collections.singleton("en"));
		// Only copy statements of some properties:
		Set<PropertyIdValue> propertyFilter = new HashSet<>();
		propertyFilter.add(Datamodel.makeWikidataPropertyIdValue("P18")); // image
		propertyFilter.add(Datamodel.makeWikidataPropertyIdValue("P106")); // occupation
		propertyFilter.add(Datamodel.makeWikidataPropertyIdValue("P569")); // birthdate
		filter.setPropertyFilter(propertyFilter);
		// Do not copy any sitelinks:
		filter.setSiteLinkFilter(Collections.emptySet());

		this.datamodelFilter = new DatamodelFilter(new DataObjectFactoryImpl(), new DocumentDataFilter());

		// The (compressed) file we write to.
		OutputStream outputStream = new GzipCompressorOutputStream(
				new BufferedOutputStream(
						ExampleHelpers
								.openExampleFileOuputStream(OUTPUT_FILE_NAME)));
		this.jsonSerializer = new JsonSerializer(outputStream);

		this.jsonSerializer.open();
	}

	@Override
	public void processItemDocument(ItemDocument itemDocument) {
		if (includeDocument(itemDocument)) {
			this.jsonSerializer.processItemDocument(this.datamodelFilter.filter(itemDocument));
		}
	}

	@Override
	public void processPropertyDocument(PropertyDocument propertyDocument) {
		// we do not serialize any properties
	}

	/**
	 * Prints some basic documentation about this program.
	 */
	public static void printDocumentation() {
		System.out.println("********************************************************************");
		System.out.println("*** Wikidata Toolkit: JsonSerializationProcessor");
		System.out.println("*** ");
		System.out.println("*** This program will download and process dumps from Wikidata.");
		System.out.println("*** It will filter the data and store the results in a new JSON file.");
		System.out.println("*** See source code for further details.");
		System.out.println("********************************************************************");
	}

	/**
	 * Closes the output. Should be called after the JSON serialization was
	 * finished.
	 */
	public void close() {
		System.out.println("Serialized "
				+ this.jsonSerializer.getEntityDocumentCount()
				+ " item documents to JSON file " + OUTPUT_FILE_NAME + ".");
		this.jsonSerializer.close();
	}

	/**
	 * Returns true if the given document should be included in the
	 * serialization.
	 *
	 * @param itemDocument the document to check
	 * @return true if the document should be serialized
	 */
	private boolean includeDocument(ItemDocument itemDocument) {
		for (StatementGroup sg : itemDocument.getStatementGroups()) {
			// "P19" is "place of birth" on Wikidata
			if (!"P19".equals(sg.getProperty().getId())) {
				continue;
			}
			for (Statement s : sg.getStatements()) {
				if (s.getClaim().getMainSnak() instanceof ValueSnak) {
					Value v = ((ValueSnak) s.getClaim().getMainSnak())
							.getValue();
					// "Q1731" is "Dresden" on Wikidata
					if (v instanceof ItemIdValue
							&& "Q1731".equals(((ItemIdValue) v).getId())) {
						return true;
					}
				}
			}
		}
		return false;
	}
}