Java Code Examples for org.apache.tika.Tika#detect()

The following examples show how to use org.apache.tika.Tika#detect(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Base64Utils.java    From NutzSite with Apache License 2.0 6 votes vote down vote up
/**
     * Converts a file (the original comment describes it as an image file) into a
     * base64 "data URI" string of the form {@code data:<content-type>;base64,<payload>}.
     * <p>
     * The content type is detected from the file with Apache Tika.
     *
     * @param file the file to encode
     * @return the data URI, or {@code null} if an {@link IOException} occurs while
     *         detecting the type or reading the file (the stack trace is printed)
     */
    public static String fileBase64(File file) {
        try {
            // Detect the content type of the file.
            String contentType = new Tika().detect(file);
            // Read the whole file into memory.
            byte[] data = Files.readAllBytes(file.toPath());
            // java.util.Base64 yields the same unwrapped, padded output as
            // DatatypeConverter.printBase64Binary, but is still present on JDK 11+
            // where javax.xml.bind was removed from the platform.
            String base64str = java.util.Base64.getEncoder().encodeToString(data);
            // Assemble the data URI. The payload is deliberately not echoed to stdout.
            return "data:" + contentType + ";base64," + base64str;
        } catch (IOException e) {
            e.printStackTrace();
            // Callers of the original method receive null on failure; keep that contract.
            return null;
        }
    }
 
Example 2
Source File: ActionToHtml.java    From o2oa with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Detects the media type of the supplied bytes with Apache Tika and dispatches to
 * {@code this.doc(bytes)} for {@code application/msword} or to {@code this.docx(bytes)}
 * for the OOXML word-processing type; the helper's result is stored in a {@code Wo}
 * which becomes the data of the returned {@code ActionResult}.
 *
 * @param effectivePerson not read by this method
 * @param bytes           raw content of the document
 * @param disposition     not read by this method
 * @return the action result carrying the populated {@code Wo}
 * @throws Exception {@code ExceptionUnsupportType} when the detected type is neither
 *                   of the two supported ones, or whatever the helpers throw
 */
ActionResult<Wo> execute(EffectivePerson effectivePerson, byte[] bytes, FormDataContentDisposition disposition)
		throws Exception {
	final ActionResult<Wo> actionResult = new ActionResult<>();
	final String mediaType = new Tika().detect(bytes);
	final Wo wrap = new Wo();
	if (mediaType.equals("application/msword")) {
		wrap.setValue(this.doc(bytes));
	} else if (mediaType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
		wrap.setValue(this.docx(bytes));
	} else {
		throw new ExceptionUnsupportType(mediaType);
	}
	actionResult.setData(wrap);
	return actionResult;
}
 
Example 3
Source File: MyMimeTypeUtils.java    From spring-boot with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the MIME type of a file using Apache Tika.
 * <p>
 * Per the original note: Tika has to read the File / URL data stream, so detection
 * takes some time. Do not fall back to guessing from the file extension, which
 * cannot adapt to the actual content and is inaccurate.
 *
 * @param file the file to inspect
 * @return the MIME type reported by {@code Tika#detect(File)}
 * @throws java.io.FileNotFoundException if {@code file} does not exist
 * @throws Exception                     if Tika fails while reading the file
 */
public static String detect(File file) throws Exception {

    // Fail early, with a specific exception type, when the file does not exist.
    if (!file.exists()) {
        throw new java.io.FileNotFoundException(
                "exception ! " + file.getAbsoluteFile() + " does not exist.");
    }
    return new Tika().detect(file);

}
 
Example 4
Source File: MyMimeTypeUtils.java    From spring-boot with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the MIME type of the resource at a URL using Apache Tika.
 * <p>
 * Per the original note: Tika has to read the File / URL data stream, so detection
 * takes some time. Do not fall back to guessing from the file extension, which
 * cannot adapt to the actual content and is inaccurate.
 *
 * @param url     the resource to inspect
 * @param timeout passed to {@code MyUrlUtils.isURLAvailable}; its unit is defined there
 * @return the MIME type reported by {@code Tika#detect(URL)}
 * @throws java.io.IOException if the availability check reports the URL as unavailable
 * @throws Exception           if the availability check or Tika fails
 */
public static String detect(URL url, int timeout) throws Exception {

    // Fail early, with a specific exception type, when the URL cannot be reached.
    if (!MyUrlUtils.isURLAvailable(url, timeout)) {
        throw new java.io.IOException("exception ! " + url.getAuthority() + " not available");
    }
    return new Tika().detect(url);
}
 
Example 5
Source File: MimeTypeUnitTest.java    From tutorials with MIT License 5 votes vote down vote up
/**
 * Verifies that Apache Tika's {@code detect(File)} reports the MIME type held in
 * {@code PNG_EXT} for the file located at {@code FILE_LOC}.
 *
 * @throws IOException if Tika cannot read the file
 */
@Test
public void whenUsingTika_thenSuccess() throws IOException {
    final File file = new File(FILE_LOC);
    final Tika tika = new Tika();
    final String mimeType = tika.detect(file);
    // Expected value first, actual second, so a failure message reads correctly.
    // NOTE(review): assumes JUnit's assertEquals(expected, actual) ordering - confirm the import.
    assertEquals(PNG_EXT, mimeType);
}
 
Example 6
Source File: DocumentTools.java    From o2oa with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Sends a Word document to the remote "collect" service for conversion to PDF and
 * returns the converted bytes.
 * <p>
 * The media type is detected with Apache Tika from the content and the file name;
 * only {@code MEDIATYPE_DOC} and {@code MEDIATYPE_DOCX} are accepted. The document is
 * posted as {@code multipart/form-data} with a {@code file} part and, when
 * {@code stamp} is not empty, a {@code stamp} text part. The response body is parsed
 * as JSON and the wrapped string value is Base64-decoded.
 *
 * @param fileName name sent as the {@code filename} of the file part; a generated
 *                 token is used when empty
 * @param bytes    raw document content
 * @param stamp    optional text for the {@code stamp} part; omitted when empty
 * @return the decoded bytes, or {@code null} when the response body is empty
 * @throws Exception {@code ExceptionUnsupportedMediaType} for any other detected type,
 *                   or any configuration, I/O or JSON failure
 */
public static byte[] toPdf(String fileName, byte[] bytes, String stamp) throws Exception {

		Config.collect().validate();

		Tika tika = new Tika();
		String type = tika.detect(bytes, fileName);

		// Acts purely as a whitelist: both supported types simply fall through.
		switch (Objects.toString(type, "")) {
		case MEDIATYPE_DOC:
		case MEDIATYPE_DOCX:
			break;
		default:
			throw new ExceptionUnsupportedMediaType(type);
		}

		URL serverUrl = new URL(Config.collect().url() + "/o2_collect_assemble/jaxrs/document/to/pdf");

		HttpURLConnection connection = (HttpURLConnection) serverUrl.openConnection();

		String boundary = "----" + StringTools.uniqueToken();

		connection.setRequestMethod("POST");
		connection.setDoOutput(true);
		connection.setUseCaches(false);
		connection.addRequestProperty("Content-Type", "multipart/form-data; boundary=" + boundary);

		// The writer is given an explicit charset (the same one used to read the response)
		// so that non-ASCII file names and stamp text do not depend on the platform default.
		try (OutputStream out = connection.getOutputStream();
				BufferedWriter writer = new BufferedWriter(
						new OutputStreamWriter(out, DefaultCharset.charset_utf_8))) {
			// --- file part headers ---
			writer.write(twoHyphens + boundary);
			writer.write(CRLF);
			writer.write("Content-Disposition: form-data; name=\"file\"; filename=\""
					+ (StringUtils.isEmpty(fileName) ? StringTools.uniqueToken() : fileName) + "\"");
			writer.write(CRLF);
			writer.write("Content-Type: " + HttpMediaType.APPLICATION_OCTET_STREAM);
			writer.write(CRLF);
			writer.write(CRLF);
			// Flush the buffered header text before writing raw bytes to the underlying
			// stream, otherwise the binary payload would overtake the headers.
			writer.flush();
			out.write(bytes);
			out.flush();
			writer.write(CRLF);
			writer.write(twoHyphens + boundary);
			if (StringUtils.isNotEmpty(stamp)) {
				// --- optional stamp part ---
				writer.write(CRLF);
				writer.write("Content-Disposition: form-data; name=\"stamp\"");
				writer.write(CRLF);
				writer.write("Content-Type: " + HttpMediaType.TEXT_PLAIN);
				writer.write(CRLF);
				writer.write(CRLF);
				writer.write(stamp);
				writer.write(CRLF);
				writer.write(twoHyphens + boundary);
			}
			// Trailing hyphens turn the last boundary line into the closing delimiter.
			writer.write(twoHyphens);
			writer.flush();
		}

		String respText = null;

		try (InputStream in = connection.getInputStream()) {
			respText = IOUtils.toString(in, DefaultCharset.charset_utf_8);
		}

		if (StringUtils.isNotEmpty(respText)) {
			ActionResponse response = XGsonBuilder.instance().fromJson(respText, ActionResponse.class);
			WrapString wrap = XGsonBuilder.instance().fromJson(response.getData(), WrapString.class);
			return Base64.decodeBase64(wrap.getValue());
		}
		return null;

	}
 
Example 7
Source File: DocumentTools.java    From o2oa with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Sends a Word document to the remote "collect" service for conversion of one page
 * to an image and returns the converted bytes.
 * <p>
 * The media type is detected with Apache Tika from the content and the file name;
 * only {@code MEDIATYPE_DOC} and {@code MEDIATYPE_DOCX} are accepted. The document is
 * posted as {@code multipart/form-data} with a {@code file} part, a {@code page} text
 * part and, when {@code stamp} is not empty, a {@code stamp} text part. The response
 * body is parsed as JSON and the wrapped string value is Base64-decoded.
 *
 * @param fileName name sent as the {@code filename} of the file part; a generated
 *                 token is used when empty
 * @param bytes    raw document content
 * @param stamp    optional text for the {@code stamp} part; omitted when empty
 * @param page     page number to send; {@code null} or a negative value is sent as 0
 * @return the decoded bytes, or {@code null} when the response body is empty
 * @throws Exception {@code ExceptionUnsupportedMediaType} for any other detected type,
 *                   or any configuration, I/O or JSON failure
 */
public static byte[] toImage(String fileName, byte[] bytes, String stamp, Integer page) throws Exception {

		Config.collect().validate();

		Tika tika = new Tika();
		String type = tika.detect(bytes, fileName);

		// Acts purely as a whitelist: both supported types simply fall through.
		switch (Objects.toString(type, "")) {
		case MEDIATYPE_DOC:
		case MEDIATYPE_DOCX:
			break;
		default:
			throw new ExceptionUnsupportedMediaType(type);
		}

		URL serverUrl = new URL(Config.collect().url() + "/o2_collect_assemble/jaxrs/document/to/image");

		HttpURLConnection connection = (HttpURLConnection) serverUrl.openConnection();

		String boundary = "----" + StringTools.uniqueToken();

		connection.setRequestMethod("POST");
		connection.setDoOutput(true);
		connection.setUseCaches(false);
		connection.addRequestProperty("Content-Type", "multipart/form-data; boundary=" + boundary);

		// The writer is given an explicit charset (the same one used to read the response)
		// so that non-ASCII file names and stamp text do not depend on the platform default.
		try (OutputStream out = connection.getOutputStream();
				BufferedWriter writer = new BufferedWriter(
						new OutputStreamWriter(out, DefaultCharset.charset_utf_8))) {
			// --- file part headers ---
			writer.write(twoHyphens + boundary);
			writer.write(CRLF);
			writer.write("Content-Disposition: form-data; name=\"file\"; filename=\""
					+ (StringUtils.isEmpty(fileName) ? StringTools.uniqueToken() : fileName) + "\"");
			writer.write(CRLF);
			writer.write("Content-Type: " + HttpMediaType.APPLICATION_OCTET_STREAM);
			writer.write(CRLF);
			writer.write(CRLF);
			// Flush the buffered header text before writing raw bytes to the underlying
			// stream, otherwise the binary payload would overtake the headers.
			writer.flush();
			out.write(bytes);
			out.flush();
			writer.write(CRLF);
			writer.write(twoHyphens + boundary);
			// --- page part ---
			writer.write(CRLF);
			writer.write("Content-Disposition: form-data; name=\"page\"");
			writer.write(CRLF);
			writer.write("Content-Type: " + HttpMediaType.TEXT_PLAIN);
			writer.write(CRLF);
			writer.write(CRLF);
			writer.write("" + ((page == null || page < 0) ? 0 : page));
			writer.write(CRLF);
			writer.write(twoHyphens + boundary);
			if (StringUtils.isNotEmpty(stamp)) {
				// --- optional stamp part ---
				writer.write(CRLF);
				writer.write("Content-Disposition: form-data; name=\"stamp\"");
				writer.write(CRLF);
				writer.write("Content-Type: " + HttpMediaType.TEXT_PLAIN);
				writer.write(CRLF);
				writer.write(CRLF);
				writer.write(stamp);
				writer.write(CRLF);
				writer.write(twoHyphens + boundary);
			}
			// Trailing hyphens turn the last boundary line into the closing delimiter.
			writer.write(twoHyphens);
			writer.flush();
		}

		String respText = null;

		try (InputStream in = connection.getInputStream()) {
			respText = IOUtils.toString(in, DefaultCharset.charset_utf_8);
		}

		if (StringUtils.isNotEmpty(respText)) {
			ActionResponse response = XGsonBuilder.instance().fromJson(respText, ActionResponse.class);
			WrapString wrap = XGsonBuilder.instance().fromJson(response.getData(), WrapString.class);
			return Base64.decodeBase64(wrap.getValue());
		}
		return null;

	}
 
Example 8
Source File: Helpers.java    From jobson with Apache License 2.0 4 votes vote down vote up
/**
 * Detects the MIME type of a stream with Apache Tika, passing the file name along
 * to {@code Tika#detect(InputStream, String)}.
 *
 * @param s        the stream to inspect
 * @param fileName name of the document the stream belongs to
 * @return the detected MIME type
 * @throws IOException if the stream cannot be read
 */
public static String getMimeType(InputStream s, String fileName) throws IOException {
    return new Tika().detect(s, fileName);
}
 
Example 9
Source File: DataResourceWorker.java    From scipio-erp with Apache License 2.0 4 votes vote down vote up
/**
 * Detects the MIME type of the data held by a {@link java.nio.ByteBuffer} using
 * Apache Tika.
 * <p>
 * For a buffer with an accessible backing array the whole backing array is examined,
 * regardless of the buffer's position and limit. For direct or read-only buffers,
 * where {@code array()} would throw, the remaining bytes are copied out through a
 * duplicate so the caller's position is left untouched.
 *
 * @param buffer the buffer whose content is inspected
 * @return the detected MIME type
 * @throws IOException declared for callers; kept from the original signature
 */
public static String getMimeTypeWithByteBuffer(java.nio.ByteBuffer buffer) throws IOException {
    final byte[] bytes;
    if (buffer.hasArray()) {
        // Same data the original implementation handed to Tika.
        bytes = buffer.array();
    } else {
        // array() is unsupported here; read through a duplicate instead.
        java.nio.ByteBuffer view = buffer.duplicate();
        bytes = new byte[view.remaining()];
        view.get(bytes);
    }

    Tika tika = new Tika();
    return tika.detect(bytes);
}
 
Example 10
Source File: ZipTextExtractor.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Walks the entries of a ZIP stream, parses every non-directory entry whose name
 * contains a dot, and returns the concatenated text of the parsed entries.
 * <p>
 * For each such entry the content type is guessed by Apache Tika from the entry
 * name, the entry is wrapped in a {@code Content} and parsed with {@code ParseUtil}.
 * The outlinks of the parse are appended to {@code outLinksList}, and
 * {@code "<entry name> <text> "} is appended to the result. An entry that raises a
 * {@code ParseException} is logged at info level and skipped.
 * <p>
 * The ZIP stream is not closed by this method.
 *
 * @param input        stream holding the ZIP archive
 * @param url          URL of the archive; entry URLs are built as {@code url + "/" + name}
 * @param outLinksList receives the outlinks found in the parsed entries
 * @return the accumulated text, or an empty string when nothing was parsed
 * @throws IOException if the archive cannot be read or an entry URL is malformed
 */
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
  StringBuilder resultText = new StringBuilder();
  ZipInputStream zin = new ZipInputStream(input);
  // One facade instance is enough for all entries.
  Tika tika = new Tika();
  ZipEntry entry;

  while ((entry = zin.getNextEntry()) != null) {
    if (entry.isDirectory()) {
      continue;
    }

    // Read to the end of the entry instead of trusting entry.getSize(), which is -1
    // when the size is unknown and would make the array allocation fail.
    byte[] b = readEntryBytes(zin);

    String fname = entry.getName();
    String newurl = url + "/" + fname;
    String base = new URL(newurl).toString();

    // Entries without an extension are not parsed.
    if (fname.lastIndexOf('.') == -1) {
      continue;
    }

    // Trying to resolve the Mime-Type
    String contentType = tika.detect(fname);
    try {
      Metadata metadata = new Metadata();
      metadata.set(Response.CONTENT_LENGTH, Long.toString(b.length));
      metadata.set(Response.CONTENT_TYPE, contentType);
      Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
      Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
      ParseData theParseData = parse.getData();

      for (Outlink outlink : theParseData.getOutlinks()) {
        outLinksList.add(new Outlink(outlink.getToUrl(), outlink.getAnchor()));
      }

      resultText.append(entry.getName()).append(' ').append(parse.getText()).append(' ');
    } catch (ParseException e) {
      if (LOG.isInfoEnabled()) {
        LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
      }
    }
  }

  return resultText.toString();
}

/** Reads the current ZIP entry to its end and returns its bytes. */
private static byte[] readEntryBytes(ZipInputStream zin) throws IOException {
  java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
  byte[] chunk = new byte[8192];
  int read;
  while ((read = zin.read(chunk, 0, chunk.length)) != -1) {
    collected.write(chunk, 0, read);
  }
  return collected.toByteArray();
}
 
Example 11
Source File: TikaAnalysis.java    From tutorials with MIT License 4 votes vote down vote up
/**
 * Detects the media type of a stream through the Apache Tika facade.
 *
 * @param stream the stream to inspect
 * @return the media type reported by {@code Tika#detect(InputStream)}
 * @throws IOException if the stream cannot be read
 */
public static String detectDocTypeUsingFacade(InputStream stream) throws IOException {
    return new Tika().detect(stream);
}