Java Code Examples for org.apache.tika.metadata.Metadata#add()

The following examples show how to use org.apache.tika.metadata.Metadata#add(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ExecUtil.java    From ctsms with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Detects the MIME type of in-memory content.
 * The file name is recorded under {@code RESOURCE_NAME_KEY} in the metadata
 * that is handed to the detector together with the bytes.
 *
 * @param data     the raw bytes to inspect
 * @param fileName the resource name stored in the detection metadata
 * @return the string form of the detected media type
 * @throws Throwable anything raised while opening the stream or detecting, propagated unchanged
 */
public static String getMimeType(byte[] data, String fileName) throws Throwable {
	final Metadata hints = new Metadata();
	hints.add(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
	TikaInputStream in = null;
	try {
		in = TikaInputStream.get(data, hints);
		return detector.detect(in, hints).toString();
	} finally {
		// Closing is best effort: an IOException raised here is dropped on purpose.
		try {
			if (in != null) {
				in.close();
			}
		} catch (IOException ignored) {
			// intentionally ignored
		}
	}
}
 
Example 2
Source File: ExecUtil.java    From ctsms with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Detects the MIME type of a file.
 * The file's simple name is recorded under {@code RESOURCE_NAME_KEY} in the
 * metadata that is handed to the detector together with the file's stream.
 *
 * @param file the file to inspect
 * @return the string form of the detected media type
 * @throws Throwable anything raised while opening the stream or detecting, propagated unchanged
 */
public static String getMimeType(File file) throws Throwable {
	final Metadata hints = new Metadata();
	hints.add(TikaMetadataKeys.RESOURCE_NAME_KEY, file.getName());
	TikaInputStream in = null;
	try {
		in = TikaInputStream.get(file, hints);
		return detector.detect(in, hints).toString();
	} finally {
		// Closing is best effort: an IOException raised here is dropped on purpose.
		try {
			if (in != null) {
				in.close();
			}
		} catch (IOException ignored) {
			// intentionally ignored
		}
	}
}
 
Example 3
Source File: TikaUtil.java    From scipio-erp with Apache License 2.0 6 votes vote down vote up
/**
 * Determines the media type of a stream with the detector of a fresh
 * {@code AutoDetectParser}, passing the file name along as metadata.
 * <p>
 * The stream is wrapped in a {@link BufferedInputStream} that is always closed
 * before this method returns (which also closes {@code is}); an
 * {@link IOException} raised by that close is ignored.
 *
 * @param is       the content to inspect
 * @param fileName the resource name stored in the detection metadata
 * @return the media type reported by the detector
 * @throws IOException if detection fails while reading the stream
 */
public static MediaType findMediaType(InputStream is, String fileName) throws IOException {
    final BufferedInputStream buffered = new BufferedInputStream(is);
    try {
        final Detector typeDetector = new AutoDetectParser().getDetector();
        final Metadata hints = new Metadata();
        hints.add(Metadata.RESOURCE_NAME_KEY, fileName);
        return typeDetector.detect(buffered, hints);
    } finally {
        try {
            buffered.close();
        } catch (IOException ignored) {
            // best-effort close; nothing to do
        }
    }
}
 
Example 4
Source File: StoredFile.java    From openmeetings with Apache License 2.0 6 votes vote down vote up
/**
 * Populates {@code name}, {@code ext} and {@code mime} from the given inputs.
 * <p>
 * When an explicit extension is supplied, the name is taken verbatim and the
 * extension is lower-cased. Otherwise the name is cut at its last dot (if any)
 * and the extension comes from {@code getFileExt(inName)}. The MIME type is then
 * detected from the formatted file name and, when present, the stream content.
 * Any failure during detection is logged and leaves {@code mime} as {@code null}.
 *
 * @param inName the incoming file name
 * @param inExt  the explicit extension, may be empty
 * @param is     the content stream, may be {@code null}
 */
private void init(String inName, String inExt, InputStream is) {
	if (!Strings.isEmpty(inExt)) {
		name = inName;
		ext = inExt.toLowerCase(Locale.ROOT);
	} else {
		final int dot = inName.lastIndexOf('.');
		name = dot >= 0 ? inName.substring(0, dot) : inName;
		ext = getFileExt(inName);
	}

	final Metadata hints = new Metadata();
	hints.add(RESOURCE_NAME_KEY, String.format(FILE_NAME_FMT, name, ext));
	try {
		// Without a stream the detector only has the resource name to go on.
		mime = tika.getDetector().detect(is != null ? TikaInputStream.get(is) : null, hints);
	} catch (Throwable e) {
		mime = null;
		log.error("Unexpected exception while detecting mime type", e);
	}
}
 
Example 5
Source File: NodeTika.java    From node-tika with MIT License 6 votes vote down vote up
/**
 * Detects the content type of the resource identified by {@code uri}.
 * <p>
 * The stream obtained from {@code createInputStream(uri)} is closed in every
 * case, including when detection throws.
 *
 * @param uri the location of the resource; its last path segment is used as a file-name hint
 * @return the detected content type, or the octet-stream default when the detector yields an empty value
 * @throws FileNotFoundException declared by the original signature; raised by code outside this method
 * @throws IOException if reading from or closing the stream fails
 * @throws TikaException declared by the original signature; raised by code outside this method
 */
public static String detectContentType(String uri) throws FileNotFoundException, IOException, TikaException {
	final Detector detector = config.getDetector();
	final String contentType;

	// try-with-resources closes the stream even if detection fails part-way.
	try (TikaInputStream inputStream = createInputStream(uri)) {
		final Metadata metadata = new Metadata();

		// Set the file name. This provides some level of type-hinting.
		metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());

		// Detect the content type.
		contentType = detector.detect(inputStream, metadata).toString();
	}

	// Return the default content-type if undetermined.
	if (contentType == null || contentType.isEmpty()) {
		return MediaType.OCTET_STREAM.toString();
	}

	return contentType;
}
 
Example 6
Source File: FileParser.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Checks whether the given file is supported, by delegating to the
 * stream-based {@code isSupported} overload with the file name as metadata.
 * <p>
 * Both streams are closed in every case, including when the delegated check throws.
 *
 * @param file the file to check
 * @return the result of the stream-based check
 * @throws FileNotFoundException declared by the original signature; presumably raised when the file cannot be opened
 * @throws IOException if reading or closing the streams fails
 */
public static boolean isSupported(File file) throws FileNotFoundException, IOException
{
	// Resources are closed in reverse order (bif, then fis), matching the original close order.
	try (FileInputStream fis = fileToInputStream(file);
			BufferedInputStream bif = new BufferedInputStream(fis))
	{
		Metadata metadata = new Metadata();
		metadata.add(Metadata.RESOURCE_NAME_KEY, file.getName());
		return isSupported(bif, metadata);
	}
}
 
Example 7
Source File: FileParser.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Extracts the text of a file by delegating to the stream-based
 * {@code extractText} overload with the file name as metadata.
 * <p>
 * Both streams are closed in every case, including when extraction throws.
 *
 * @param file the file to read
 * @return the result of the stream-based extraction; per the original contract, null if the file is not supported
 * @throws Exception if opening, extracting from, or closing the streams fails
 */
public static FileContent extractText(File file) throws Exception
{
	// Resources are closed in reverse order (bif, then fis), matching the original close order.
	try (FileInputStream fis = fileToInputStream(file);
			BufferedInputStream bif = new BufferedInputStream(fis))
	{
		Metadata metadata = new Metadata();
		metadata.add(Metadata.RESOURCE_NAME_KEY, file.getName());
		return extractText(bif, metadata);
	}
}
 
Example 8
Source File: ParseResultTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the metadata fixture used by the tests: "Author" is added twice
 * with different values, followed by a single "Date" entry.
 *
 * @return a freshly created, populated {@code Metadata}
 */
static Metadata getMetadata() {
  final Metadata fixture = new Metadata();
  // The same key is added twice on purpose.
  fixture.add("Author", "BeamTikaUser");
  fixture.add("Author", "BeamTikaUser2");
  fixture.add("Date", "2017-09-01");
  return fixture;
}
 
Example 9
Source File: TikaExtractor.java    From ache with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code Metadata} carrying whichever of the two hints is available.
 *
 * @param fileName    the resource name, skipped when {@code null}
 * @param contentType the content type, skipped when {@code null}
 * @return a new metadata object holding only the non-null hints
 */
private Metadata createMetadata(String fileName, String contentType) {
    final Metadata hints = new Metadata();
    if (fileName != null) {
        hints.add(Metadata.RESOURCE_NAME_KEY, fileName);
    }
    if (contentType != null) {
        hints.add(Metadata.CONTENT_TYPE, contentType);
    }
    return hints;
}
 
Example 10
Source File: FetchedResultTest.java    From ache with Apache License 2.0 5 votes vote down vote up
/**
 * Test method for {@link crawlercommons.fetcher.FetchedResult#report()}.
 * Makes no assertions: it builds a fully populated result and logs the
 * generated report so that its layout can be inspected by eye.
 *
 * @throws UnsupportedEncodingException if the "UTF-8" charset name is rejected
 */
@Test
public void testPrintReport() throws UnsupportedEncodingException {
    final String url = "http://en.wikipedia.org/wiki/Glasgow";

    final Metadata headers = new Metadata();
    headers.add(Metadata.CONTENT_DISPOSITION, "This is content disposition");
    headers.add(Metadata.CONTENT_ENCODING, "This is the encoding");
    headers.add(Metadata.CONTENT_LANGUAGE, "This is some language");
    headers.add(Metadata.CONTENT_LENGTH, "This is the length");

    final Payload payload = new Payload();
    payload.put("Item 1", 1234);
    payload.put("Item 2", 5678);
    payload.put("Item 3", 1357);
    payload.put("Item 4", 2468);

    final long fetchTime = System.currentTimeMillis();
    final byte[] content = ("Glasgow (/ˈɡlɑːzɡoʊ, ˈɡlæz-/;[4] Scots: Glesca; Scottish Gaelic: Glaschu) "
            + "is the largest city in Scotland, and the third largest in the United Kingdom.").getBytes("UTF-8");

    final FetchedResult result = new FetchedResult(
            url, // baseUrl
            url, // redirectedUrl
            fetchTime, // fetchTime
            headers, // headers
            content, // content
            "ScotsText", // contentType
            2014, // responseRate
            payload, // payload
            url, // newBaseUrl
            0, // numRedirects
            "wikipedia.org", // hostAddress
            200, // statusCode
            ""); // reasonPhrase
    LOG.error(result.report());
}
 
Example 11
Source File: ExcelHtmlParser.java    From components with Apache License 2.0 5 votes vote down vote up
/**
 * Parses HTML content with Tika's {@code HtmlParser} and returns the values
 * collected by a {@code SimpleValuesContentHandler}.
 * <p>
 * Parsing is best effort: any exception is logged at debug level and whatever
 * the handler gathered up to that point is returned.
 *
 * @param rawContent the HTML content to parse
 * @param encoding   the value stored as the content-encoding metadata
 * @param limit      passed to the content handler (presumably a cap on collected rows; confirm against the handler)
 * @return the values held by the content handler after parsing
 */
public static List<List<String>> getRows(InputStream rawContent, String encoding, long limit) {
  final SimpleValuesContentHandler handler = new SimpleValuesContentHandler(-1, limit);
  final HtmlParser parser = new HtmlParser();

  final Metadata hints = new Metadata();
  hints.add(Metadata.CONTENT_ENCODING, encoding);

  try {
    parser.parse(rawContent, handler, hints, new ParseContext());
  } catch (Exception e) {
    // Deliberately not rethrown: the partial result is still returned below.
    LOGGER.debug("Failed to parse the excel html format document.", e);
  }

  return handler.getValues();
}
 
Example 12
Source File: NodeTika.java    From node-tika with MIT License 5 votes vote down vote up
/**
 * Adds type hints to {@code metadata}.
 * The resource name is set from the last path segment of {@code uri} when a
 * URI is given, and the normalized content type is added when normalization
 * yields a non-null value.
 *
 * @param metadata    the metadata object to update in place
 * @param contentType the raw content type, passed through {@code normalizeContentType}
 * @param uri         the resource location, may be {@code null}
 */
private static void fillMetadata(Metadata metadata, String contentType, String uri) {
	// Set the file name (replacing any previous value, hence set rather than add).
	if (uri != null) {
		metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());
	}

	// Normalise the content-type; nothing more to do when there is none.
	final String normalized = normalizeContentType(contentType);
	if (normalized == null) {
		return;
	}

	// Set the content-type.
	metadata.add(HttpHeaders.CONTENT_TYPE, normalized);
}
 
Example 13
Source File: NodeTika.java    From node-tika with MIT License 5 votes vote down vote up
/**
 * Detects the content type of the resource identified by {@code uri} and,
 * when a character set can be determined, appends it as a {@code charset} parameter.
 * <p>
 * The stream obtained from {@code createInputStream(uri)} is closed in every
 * case, including when detection or reader construction throws.
 *
 * @param uri the location of the resource; its last path segment is used as a file-name hint
 * @return the octet-stream default when the content type is empty; otherwise the content type,
 *         followed by {@code "; charset=<name>"} when a charset was detected
 * @throws FileNotFoundException declared by the original signature; raised by code outside this method
 * @throws IOException if reading from or closing the stream fails
 * @throws TikaException declared by the original signature; raised by code outside this method
 */
public static String detectContentTypeAndCharset(String uri) throws FileNotFoundException, IOException, TikaException {
	final Detector detector = config.getDetector();
	final String contentType;
	final String charset;

	// try-with-resources closes the stream even if any step below fails.
	try (TikaInputStream inputStream = createInputStream(uri)) {
		final Metadata metadata = new Metadata();

		// Set the file name. This provides some level of type-hinting.
		metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());

		// Detect the content type.
		contentType = detector.detect(inputStream, metadata).toString();

		// Use metadata to provide type-hinting to the AutoDetectReader.
		fillMetadata(metadata, contentType, uri);

		// Detect the character set. The reader only wraps inputStream, so it is
		// not closed separately; closing the stream above releases the resource.
		final AutoDetectReader reader = new AutoDetectReader(inputStream, metadata);
		charset = reader.getCharset().toString();
	}

	// Return the default content-type if undetermined.
	if (contentType == null || contentType.isEmpty()) {
		return MediaType.OCTET_STREAM.toString();
	}

	// Append the charset if the content-type was determined.
	if (charset != null && !charset.isEmpty()) {
		return contentType + "; charset=" + charset;
	}

	return contentType;
}
 
Example 14
Source File: TikaDetector.java    From spring-boot-email-tools with Apache License 2.0 4 votes vote down vote up
/**
 * Detects the media type of a stream, using the file name as a metadata hint,
 * and converts the result to Spring's {@code MediaType}.
 *
 * @param stream   the content to inspect; must not be {@code null}
 * @param fileName the resource name stored in the detection metadata
 * @return the detected type converted by {@code toSpringMediaType}
 * @throws IOException if the detector fails while reading the stream
 * @throws NullPointerException if {@code stream} is {@code null}
 */
private org.springframework.http.MediaType detect(final TikaInputStream stream, final String fileName) throws IOException {
    final Metadata hints = new Metadata();
    hints.add(RESOURCE_NAME_KEY, fileName);
    return toSpringMediaType(detector.detect(requireNonNull(stream), hints));
}