package com.dovetailsoftware.aws.lambda; import java.io.IOException; import java.io.InputStream; import java.io.ByteArrayInputStream; import java.io.StringWriter; import java.nio.charset.Charset; import java.net.URLDecoder; import java.util.Iterator; import javax.xml.transform.OutputKeys; import javax.xml.transform.TransformerFactory; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.TransformerConfigurationException; import org.xml.sax.SAXException; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import com.amazonaws.services.lambda.runtime.Context; import com.amazonaws.services.lambda.runtime.LambdaLogger; import com.amazonaws.services.lambda.runtime.RequestHandler; import com.amazonaws.services.lambda.runtime.events.S3Event; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3Client; import com.amazonaws.services.s3.event.S3EventNotification.S3EventNotificationRecord; import com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.PutObjectRequest; import com.amazonaws.services.s3.model.ObjectMetadata; import com.amazonaws.services.s3.model.S3Object; import org.json.simple.JSONObject; public class TikaLambdaHandler implements RequestHandler<S3Event, String> { private LambdaLogger _logger; public String handleRequest(S3Event s3event, Context context) { _logger = context.getLogger(); _logger.log("Received S3 Event: " + s3event.toJson()); try { S3EventNotificationRecord record = s3event.getRecords().get(0); String bucket = record.getS3().getBucket().getName(); String extractBucket = "extracts." + bucket; // Object key may have spaces or unicode non-ASCII characters. String key = URLDecoder.decode(record.getS3().getObject().getKey().replace('+', ' '), "UTF-8"); // Short-circuit ignore .extract files because they have already been extracted, this prevents an endless loop if (key.toLowerCase().endsWith(".extract")) { _logger.log("Ignoring extract file " + key); return "Ignored"; } AmazonS3 s3Client = new AmazonS3Client(); S3Object s3Object = s3Client.getObject(new GetObjectRequest(bucket, key)); try (InputStream objectData = s3Object.getObjectContent()) { String extractJson = doTikaStuff(bucket, key, objectData); byte[] extractBytes = extractJson.getBytes(Charset.forName("UTF-8")); int extractLength = extractBytes.length; ObjectMetadata metaData = new ObjectMetadata(); metaData.setContentLength(extractLength); _logger.log("Saving extract file to S3"); InputStream inputStream = new ByteArrayInputStream(extractBytes); s3Client.putObject(extractBucket, key + ".extract", inputStream, metaData); } } catch (IOException | TransformerConfigurationException | SAXException e) { _logger.log("Exception: " + e.getLocalizedMessage()); throw new RuntimeException(e); } return "Success"; } private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException { _logger.log("Extracting text with Tika"); String extractedText = ""; SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); StringWriter sw = new StringWriter(); handler.setResult(new StreamResult(sw)); AutoDetectParser parser = new AutoDetectParser(); ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, parser); Tika tika = new Tika(); Metadata tikaMetadata = new Metadata(); try { // for synthetic transactions if( key.toLowerCase().endsWith("tika.exception.testing.pdf")) { throw new TikaException("Test Tika Exception"); } parser.parse(objectData, handler, tikaMetadata, parseContext); extractedText = sw.toString(); } catch( TikaException e) { _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage()); return assembleExceptionResult(bucket, key, e); } _logger.log("Tika parsing success"); return assembleExtractionResult(bucket, key, extractedText, tikaMetadata); } private String assembleExtractionResult(String bucket, String key, String extractedText, Metadata tikaMetadata) { JSONObject extractJson = new JSONObject(); String contentType = tikaMetadata.get("Content-Type"); contentType = contentType != null ? contentType : "content/unknown"; String contentLength = tikaMetadata.get("Content-Length"); contentLength = contentLength != null ? contentLength : "0"; extractJson.put("Exception", null); extractJson.put("FilePath", "s3://" + bucket + "/" + key); extractJson.put("Text", extractedText); extractJson.put("ContentType", contentType); extractJson.put("ContentLength", contentLength); JSONObject metadataJson = new JSONObject(); for( String name : tikaMetadata.names() ){ String[] elements = tikaMetadata.getValues(name); String joined = String.join(", ", elements); metadataJson.put(name, joined); } extractJson.put("Metadata", metadataJson); return extractJson.toJSONString(); } private String assembleExceptionResult(String bucket, String key, Exception e){ JSONObject exceptionJson = new JSONObject(); exceptionJson.put("Exception", e.getLocalizedMessage()); exceptionJson.put("FilePath", "s3://" + bucket + "/" + key); exceptionJson.put("ContentType", "unknown"); exceptionJson.put("ContentLength", "0"); exceptionJson.put("Text", ""); JSONObject metadataJson = new JSONObject(); metadataJson.put("resourceName", "s3://" + bucket + "/" + key); exceptionJson.put("Metadata", metadataJson); return exceptionJson.toJSONString(); } }