/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/**
 * The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader
 * and modified for Mayo use.
 *
 * A simple collection reader that reads documents from a directory
 * in the filesystem.  It can be configured with the following parameters:
 * <ul>
 *   <li><code>InputDirectory</code> - path to directory containing files</li>
 *   <li><code>Encoding</code> (optional) - character encoding of the input
 *      files</li>
 *   <li><code>Language</code> (optional) - language of the input documents</li>
 *   <li><code>Extensions</code> (optional) - the file extensions (written without
 *      a leading dot) of the files that the collection reader will read; when
 *      omitted, every file is read</li>
 *   <li><code>Recurse</code> (optional) - when true, files in sub directories of
 *      the input directory are read as well</li>
 * </ul>
 *
 * TODO We may need to provide a way to specify some portion of the path of the file
 * to be included in the id of the document especially if we extend to recursively
 * gather files in the directory from sub directories.
*/

import org.apache.ctakes.core.config.ConfigParameterConstants;
import org.apache.ctakes.core.pipeline.PipeBitInfo;
import org.apache.ctakes.core.resource.FileLocator;
import org.apache.ctakes.typesystem.type.structured.DocumentID;
import org.apache.ctakes.typesystem.type.structured.Metadata;
import org.apache.ctakes.typesystem.type.structured.SourceData;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.internal.util.XMLUtils;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceConfigurationException;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import java.io.*;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;

//import org.apache.uima.jcas.tcas.DocumentAnnotation;

/**
 * Collection reader that walks an input directory (optionally including its sub directories),
 * and for each accepted file puts the file text into the CAS after replacing every character
 * that is not legal in XML with a space.  Files whose name ends with <code>.gz</code> are
 * decompressed while being read.  A {@link DocumentID} holding the file path relative to the
 * input directory, and a placeholder {@link Metadata} / {@link SourceData} pair, are added to
 * each CAS.
 */
@PipeBitInfo(
      name = "Xml fixing Files in Dir Reader",
      description = "Reads document texts from text files in a directory, replacing illegal xml characters with spaces.",
      role = PipeBitInfo.Role.READER,
      products = { PipeBitInfo.TypeProduct.DOCUMENT_ID }
)
public class XmlFixingFilesInDirectoryCollectionReader extends CollectionReader_ImplBase {

   /**
    * Name of configuration parameter that contains the character encoding used
    * by the input files.  If not specified, the default system encoding will
    * be used.
    */
   public static final String PARAM_ENCODING = "Encoding";

   /**
    * Name of optional configuration parameter that contains the language of
    * the documents in the input directory.
    * NOTE(review): the value is read during initialization but is currently not
    * written to the CAS (see {@link #getNext(CAS)}).
    */
   public static final String PARAM_LANGUAGE = "Language";

   /**
    * Name of optional configuration parameter that specifies the extensions
    * of the files that the collection reader will read.  Values for this
    * parameter should not begin with a dot <code>'.'</code>.
    */
   public static final String PARAM_EXTENSIONS = "Extensions";

   /**
    * Name of optional Boolean configuration parameter.  When true, files in sub directories
    * of the input directory are collected as well; when absent or false only the files
    * directly inside the input directory are read.
    */
   public static final String PARAM_RECURSE = "Recurse";

   /** Size of the char buffer used when copying a file into memory. */
   private static final int READ_BUFFER_SIZE = 8192;

   /** Files that will be served by {@link #getNext(CAS)}, in collection order. */
   protected ArrayList<File> iv_files;
   private String iv_encoding;
   private String iv_language;
   // Deliberately an instance field: a static field would be overwritten by every reader
   // that gets initialized, so two readers could not use different extension filters.
   private String[] iv_extensions;
   /** Index into {@link #iv_files} of the next file to read. */
   protected int iv_currentIndex;
   private boolean iv_recurse = false;
   private String iv_rootPath = "";

   private org.apache.uima.util.Logger logger = null;

   /**
    * Reads the configuration parameters, resolves the input directory and builds the list
    * of files to read.
    *
    * @throws ResourceInitializationException if the input directory cannot be located, is not
    *       a directory, or its content cannot be listed
    * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
    */
   @Override
   public void initialize() throws ResourceInitializationException {
      logger = getUimaContext().getLogger();

      final String inputDirPath = (String)getConfigParameterValue( ConfigParameterConstants.PARAM_INPUTDIR );
      final File directory;
      try {
         directory = FileLocator.locateFile( inputDirPath );
      } catch ( IOException ioE ) {
         // Keep the original failure as the cause so the reason the lookup failed is not lost.
         throw new ResourceInitializationException(
               ResourceConfigurationException.DIRECTORY_NOT_FOUND,
               new Object[] { ConfigParameterConstants.PARAM_INPUTDIR, getMetaData().getName(), inputDirPath },
               ioE );
      }

      iv_encoding = (String)getConfigParameterValue( PARAM_ENCODING );
      iv_language = (String)getConfigParameterValue( PARAM_LANGUAGE );
      iv_extensions = (String[])getConfigParameterValue( PARAM_EXTENSIONS );
      iv_currentIndex = 0;

      final Boolean recurse = (Boolean)getConfigParameterValue( PARAM_RECURSE );
      iv_recurse = recurse != null && recurse.booleanValue();
      iv_rootPath = directory.getPath();

      // if input directory does not exist or is not a directory, throw exception
      if ( !directory.exists() || !directory.isDirectory() ) {
         throw new ResourceInitializationException(
               ResourceConfigurationException.DIRECTORY_NOT_FOUND,
               new Object[] { ConfigParameterConstants.PARAM_INPUTDIR, this.getMetaData().getName(),
                              directory.getPath() } );
      }

      // get list of files in the specified directory (and below it when recursion is enabled)
      iv_files = new ArrayList<File>();
      try {
         collectFiles( directory, iv_files );
      } catch ( IOException ioe ) {
         throw new ResourceInitializationException( ioe );
      }
      logger.log( Level.INFO, "iv_files.size()=" + iv_files.size() );
   }

   /**
    * Adds every file of the given directory that passes the extension filter to the given list.
    * Sub directories are descended into only when recursion is enabled; otherwise they are skipped.
    *
    * @param directory directory to scan
    * @param files     list receiving the accepted files
    * @throws IOException if the content of a directory cannot be listed
    */
   private void collectFiles( final File directory, final List<File> files ) throws IOException {
      final File[] children = directory.listFiles();
      if ( children == null ) {
         // File.listFiles() returns null instead of throwing when the directory cannot be read.
         throw new IOException( "Unable to list content of directory " + directory.getPath() );
      }
      for ( File child : children ) {
         if ( child.isDirectory() ) {
            if ( iv_recurse ) {
               collectFiles( child, files );
            }
         } else if ( hasValidExtension( child ) ) {
            files.add( child );
         }
      }
   }

   /**
    * @param file candidate file
    * @return true if no extension filter is configured, or if the file name ends with a dot
    *       followed by one of the configured extensions
    */
   private boolean hasValidExtension( final File file ) {
      if ( iv_extensions == null ) {
         return true;
      }
      final String fileName = file.getName();
      for ( String extension : iv_extensions ) {
         if ( fileName.endsWith( "." + extension ) ) {
            return true;
         }
      }
      return false;
   }

   /**
    * @see org.apache.uima.collection.CollectionReader#hasNext()
    */
   @Override
   public boolean hasNext() {
      return iv_currentIndex < iv_files.size();
   }

   /**
    * Reads the next file into the given CAS: adds a {@link DocumentID}, sets the document text
    * with non-XML characters replaced by spaces, and adds placeholder metadata.
    * The current index is advanced even when reading fails, so a broken file is not retried.
    *
    * @param aCAS the CAS to populate
    * @throws IOException         if the file cannot be read
    * @throws CollectionException if a JCas cannot be obtained from the CAS
    * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
    */
   @Override
   public void getNext( CAS aCAS ) throws IOException, CollectionException {
      JCas jcas;
      try {
         jcas = aCAS.getJCas();

         final File file = iv_files.get( iv_currentIndex );
         final String text = readDocumentText( file );

         final DocumentID documentIDAnnotation = new DocumentID( jcas );
         documentIDAnnotation.setDocumentID( createDocID( file ) );
         documentIDAnnotation.addToIndexes();

         // put document in CAS (assume CAS)
         jcas.setDocumentText( forceXmlSerializable( text ) );

         // NOTE(review): iv_language is read from the configuration but is intentionally left
         // unapplied here; the original call that set it on the DocumentAnnotation was disabled.
         // Confirm whether the language should be written to the CAS.
      } catch ( CASException e ) {
         throw new CollectionException( e );
      } finally {
         iv_currentIndex++;
      }
      setMetadata( jcas );
   }

   /**
    * Reads the complete content of a file as text, decompressing it first when the file name
    * ends with <code>.gz</code>.  The text is returned exactly as decoded, including its line
    * terminators, so words on adjacent lines are not fused together.
    *
    * @param file file to read
    * @return decoded content of the file
    * @throws IOException if the file cannot be opened, decompressed, decoded or read
    */
   private String readDocumentText( final File file ) throws IOException {
      // Always points at the outermost wrapper created so far, so that a failure while
      // building the stream chain still releases whatever was already opened.
      Closeable outermost = null;
      try {
         InputStream stream = new FileInputStream( file );
         outermost = stream;
         if ( file.getName().endsWith( ".gz" ) ) {
            stream = new GZIPInputStream( stream );
            outermost = stream;
         }
         final Reader reader;
         if ( iv_encoding != null ) {
            reader = new InputStreamReader( stream, iv_encoding );
         } else {
            // no encoding configured: fall back to the default system encoding
            reader = new InputStreamReader( stream );
         }
         outermost = reader;

         final StringBuilder text = new StringBuilder();
         final char[] buffer = new char[ READ_BUFFER_SIZE ];
         int count;
         while ( (count = reader.read( buffer )) != -1 ) {
            text.append( buffer, 0, count );
         }
         return text.toString();
      } finally {
         if ( outermost != null ) {
            outermost.close();
         }
      }
   }

   /**
    * Builds the document id of a file: its path with the input directory path (and the
    * separator following it) removed from the front.
    *
    * @param file file being read
    * @return path of the file relative to the input directory
    */
   private String createDocID( final File file ) {
      final String path = file.getPath();
      final boolean rootConsumesSeparator = iv_rootPath.endsWith( "" + File.separator ) || iv_rootPath.equals( "" );
      // When the root does not already end with a separator, skip the one that follows it.
      final int prefixLength = rootConsumesSeparator ? iv_rootPath.length() : iv_rootPath.length() + 1;
      return path.substring( prefixLength );
   }

   /**
    * Nothing to release: every file is opened and closed within {@link #getNext(CAS)}.
    *
    * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
    */
   @Override
   public void close() throws IOException {
   }

   /**
    * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
    */
   @Override
   public Progress[] getProgress() {
      return new Progress[] { new ProgressImpl( iv_currentIndex, iv_files.size(), Progress.ENTITIES ) };
   }

   /**
    * Gets the total number of documents that will be returned by this
    * collection reader.  This is not part of the general collection reader
    * interface.
    *
    * @return the number of documents in the collection
    */
   public int getNumberOfDocuments() {
      return iv_files.size();
   }

   /**
    * Replaces every character that XMLUtils reports as a non-XML character with a space.
    *
    * @param s text to clean, may be null
    * @return the cleaned text; an empty string when the given text is null
    */
   private String forceXmlSerializable( String s ) {
      if ( s == null ) {
         return "";
      }
      if ( s.length() == 0 ) {
         return s;
      }
      // Performance-wise this is not the best but since this is not that common an occurrence,
      // it is good enough.  Each pass removes all occurrences of one offending character.
      int badChar = XMLUtils.checkForNonXmlCharacters( s );
      while ( badChar > -1 ) {
         s = s.replace( s.charAt( badChar ), ' ' );
         badChar = XMLUtils.checkForNonXmlCharacters( s );
      }
      return s;
   }

   /**
    * Adds a {@link Metadata} with placeholder values (patient id 0, unknown specialty and note
    * type, encounter and instance id -1, current time as original date) to the CAS indexes and
    * logs the ids at INFO level.
    *
    * @param jCas the CAS to add the metadata to
    * @return the given jCas
    */
   private JCas setMetadata( final JCas jCas ) {
      final Metadata metadata = new Metadata( jCas );
      final SourceData sourcedata = new SourceData( jCas );

      metadata.setPatientID( 0L );
      sourcedata.setAuthorSpecialty( "Unknown" );
      sourcedata.setNoteTypeCode( "UnknownNoteType" );
      sourcedata.setSourceEncounterId( -1L + "" );
      sourcedata.setSourceInstanceId( -1L + "" );
      sourcedata.setSourceOriginalDate( (new Timestamp( System.currentTimeMillis() )).toString() );

      metadata.setSourceData( sourcedata );
      jCas.addFsToIndexes( metadata );

      logger.log( Level.INFO, metadata.getPatientID() + " " + sourcedata.getSourceEncounterId() + " "
                              + sourcedata.getSourceInstanceId() );
      return jCas;
   }

}