/* * FXDesktopSearch Copyright 2013 Mirko Sertic * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.mirkosertic.desktopsearch; import lombok.extern.slf4j.Slf4j; import org.apache.tika.Tika; import org.apache.tika.langdetect.OptimaizeLangDetector; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.metadata.Metadata; import org.apache.tika.utils.DateUtils; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; import java.nio.file.attribute.BasicFileAttributes; import java.util.Calendar; import java.util.GregorianCalendar; import java.util.Locale; import java.util.regex.Pattern; @Slf4j class ContentExtractor { private final Tika tika; private final Pattern metaDataDatePattern; private final Configuration configuration; private final LanguageDetector languageDetector; public ContentExtractor(final Configuration aConfiguration) { // TODO: auch korrekt dieses Muster verarbeitrn : Mon Feb 18 15:55:10 CET 2013 metaDataDatePattern = Pattern.compile("(\\d{4})-(\\d{2})-(\\d{2})T(\\d{2}):(\\d{2}):(\\d{2})Z"); configuration = aConfiguration; tika = new Tika(); tika.setMaxStringLength(1024 * 1024 * 5); final var theDetector = new OptimaizeLangDetector(); try { theDetector.loadModels(); languageDetector = theDetector; } catch (final Exception e) { throw new RuntimeException(e); } } private String harmonizeMetaDataName(String aName) { final var p = aName.indexOf(":"); if (p>0) { aName = aName.substring(p+1); } final var theReplacement = configuration.getMetaDataNameReplacement().get(aName); if (theReplacement != null) { return theReplacement; } return aName; } public Content extractContentFrom(final Path aFile, final BasicFileAttributes aBasicFileAttributes) { try { final var theMetaData = new Metadata(); final String theStringData; // Files under 10 Meg are read into memory as a whole if (aBasicFileAttributes.size() < 1024 * 1024 * 4) { final var theData = Files.readAllBytes(aFile); theStringData = tika.parseToString(new ByteArrayInputStream(theData), theMetaData); } else { try (final var theStream = Files.newInputStream(aFile, StandardOpenOption.READ)) { theStringData = tika.parseToString(new BufferedInputStream(theStream), theMetaData) .replace('\n', ' ') .replace('\r', ' ') .replace('\t',' '); } } final var theLanguageResult = languageDetector.detect(theStringData); final var theFileTime = aBasicFileAttributes.lastModifiedTime(); var theLanguage = SupportedLanguage.getDefault(); try { theLanguage = SupportedLanguage.valueOf(theLanguageResult.getLanguage()); if (!configuration.getEnabledLanguages().contains(theLanguage)) { theLanguage = SupportedLanguage.getDefault(); } } catch (final Exception e) { log.info("Language {} was detected, but is not supported", theLanguageResult.getLanguage()); } final var theContent = new Content(aFile.toString(), theStringData, aBasicFileAttributes.size(), theFileTime.toMillis(), theLanguage); for (final var theName : theMetaData.names()) { final var theMetaDataValue = theMetaData.get(theName); // Try to detect if this is a date final var theMatcher = metaDataDatePattern.matcher(theMetaDataValue); if (theMatcher.find()) { final var theYear = Integer.parseInt(theMatcher.group(1)); final var theMonth = Integer.parseInt(theMatcher.group(2)); final var theDay = Integer.parseInt(theMatcher.group(3)); final var theHour = Integer.parseInt(theMatcher.group(4)); final var theMinute = Integer.parseInt(theMatcher.group(5)); final var theSecond = Integer.parseInt(theMatcher.group(6)); final var theCalendar = GregorianCalendar.getInstance(DateUtils.UTC, Locale.US); theCalendar.set(Calendar.YEAR, theYear); theCalendar.set(Calendar.MONTH, theMonth - 1); theCalendar.set(Calendar.DAY_OF_MONTH, theDay); theCalendar.set(Calendar.HOUR_OF_DAY, theHour); theCalendar.set(Calendar.MINUTE, theMinute); theCalendar.set(Calendar.SECOND, theSecond); theCalendar.set(Calendar.MILLISECOND, 0); theContent.addMetaData(harmonizeMetaDataName(theName.toLowerCase()), theCalendar.getTime()); } else { theContent.addMetaData(harmonizeMetaDataName(theName.toLowerCase()), theMetaData.get(theName)); } } final var theFileName = aFile.toString(); final var p = theFileName.lastIndexOf("."); if (p > 0) { final var theExtension = theFileName.substring(p + 1); theContent.addMetaData(IndexFields.EXTENSION, theExtension.toLowerCase()); } if (configuration.isNaturalLanguageProcessing()) { // Run natural language processing try { final NLP nlp = NLP.forLanguage(theLanguage); nlp.addMetaDataTo(theStringData, theContent); } catch (final Exception e) { log.warn("Error on NLP, document will still be indexed", e); } } return theContent; } catch (final Exception e) { log.error("Error extracting content of {}", aFile, e); } return null; } public boolean supportsFile(final String aFilename) { for (final var theType : configuration.getEnabledDocumentTypes()) { if (theType.supports(aFilename)) { return true; } } return false; } }