/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.io.tika; import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState; import java.io.Serializable; import java.util.Arrays; import java.util.Objects; import javax.annotation.Nullable; import org.apache.beam.sdk.util.SerializableThrowable; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.MoreObjects; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Throwables; import org.apache.tika.metadata.Metadata; /** * The result of parsing a single file with Tika: contains the file's location, metadata, extracted * text, and optionally an error. If there is an error, the metadata and extracted text may be * partial (i.e. not represent the entire file). */ public class ParseResult implements Serializable { private final String fileLocation; private final String content; private final Metadata metadata; private final String[] metadataNames; @Nullable private final SerializableThrowable error; public static ParseResult success(String fileLocation, String content, Metadata metadata) { return new ParseResult(fileLocation, content, metadata, null); } public static ParseResult success(String fileLocation, String content) { return new ParseResult(fileLocation, content, new Metadata(), null); } public static ParseResult failure( String fileLocation, String partialContent, Metadata partialMetadata, Throwable error) { return new ParseResult(fileLocation, partialContent, partialMetadata, error); } private ParseResult(String fileLocation, String content, Metadata metadata, Throwable error) { checkArgument(fileLocation != null, "fileLocation can not be null"); checkArgument(content != null, "content can not be null"); checkArgument(metadata != null, "metadata can not be null"); this.fileLocation = fileLocation; this.content = content; this.metadata = metadata; this.metadataNames = metadata.names(); this.error = (error == null) ? null : new SerializableThrowable(error); } /** Returns the absolute path to the input file. */ public String getFileLocation() { return fileLocation; } /** Returns whether this file was parsed successfully. */ public boolean isSuccess() { return error == null; } /** Returns the parse error, if the file was parsed unsuccessfully. */ public Throwable getError() { checkState(error != null, "This is a successful ParseResult"); return error.getThrowable(); } /** * Same as {@link #getError}, but returns the complete stack trace of the error as a {@link * String}. */ public String getErrorAsString() { return Throwables.getStackTraceAsString(getError()); } /** Returns the extracted text. May be partial, if this parse result contains a failure. */ public String getContent() { return content; } /** Returns the extracted metadata. May be partial, if this parse result contains a failure. */ public Metadata getMetadata() { return metadata; } @Override public int hashCode() { return Objects.hash( getFileLocation(), getContent(), getMetadataHashCode(), isSuccess() ? "" : Throwables.getStackTraceAsString(getError())); } @Override public boolean equals(Object obj) { if (!(obj instanceof ParseResult)) { return false; } ParseResult other = (ParseResult) obj; return Objects.equals(getFileLocation(), other.getFileLocation()) && Objects.equals(getContent(), other.getContent()) && Objects.equals(getMetadata(), other.getMetadata()) && (isSuccess() ? other.isSuccess() : (!other.isSuccess() && Objects.equals(getErrorAsString(), other.getErrorAsString()))); } // TODO: Remove this function and use metadata.hashCode() once Apache Tika 1.17 gets released. private int getMetadataHashCode() { int hashCode = 0; for (String name : metadataNames) { hashCode += name.hashCode() ^ Arrays.hashCode(metadata.getValues(name)); } return hashCode; } @Override public String toString() { return MoreObjects.toStringHelper(this) .add("fileLocation", fileLocation) .add("content", "<" + content.length() + " chars>") .add("metadata", metadata) .add("error", getError() == null ? null : Throwables.getStackTraceAsString(getError())) .toString(); } }