package org.jumbune.datavalidation;

import static org.jumbune.datavalidation.DataValidationConstants.SLAVE_FILE_LOC;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Reducer;
import org.jumbune.common.utils.Constants;
import org.jumbune.datavalidation.ArrayListWritable;
import org.jumbune.utils.JobUtil;

/**
 * Reducer that consumes {@code <Text, Iterable<DataDiscrepanciesArrayWritable>>} pairs and, for each
 * violation type found in the incoming beans (null check, data type, regex, number of fields),
 * emits one {@link DataViolationWritable} summary keyed by that violation type.
 *
 * <p>Besides the summaries written to the context in {@link #reduce}, every individual violation
 * is kept in a per-type {@code MultiValueTreeMap} and dumped to local files in {@link #cleanup}.
 */
public class DataValidationReducer extends Reducer<Text, DataDiscrepanciesArrayWritable, Text, DataViolationWritable> {

    /**
     * Order in which the per-type summaries are written to the context at the end of
     * {@link #reduce}.
     */
    private static final String[] SUMMARY_ORDER = {
        DataValidationConstants.USER_DEFINED_NULL_CHECK,
        DataValidationConstants.USER_DEFINED_DATA_TYPE,
        DataValidationConstants.USER_DEFINED_REGEX_CHECK,
        DataValidationConstants.NUM_OF_FIELDS_CHECK
    };

    /** Base directory (placeholders already resolved) under which violation files are written. */
    private String dirPath;

    /** Open writers keyed by absolute file path. */
    private Map<String, BufferedWriter> fileHandlerMap;

    /** Limit read from the job configuration; passed to each {@code MultiValueTreeMap}. */
    private int maxViolationsInReport;

    /**
     * Number of records reported for each (file, split end offset) pair. A {@link TreeMap} is used
     * so that entries iterate sorted by file name and then by split end offset, which
     * {@link #calculateActualLineNo} relies on.
     */
    private Map<FileOffsetKey, Long> offsetLinesMap;

    /** Names of all files for which at least one violation was recorded. */
    private Set<String> fileNames;

    /** Null-check violations, grouped by file name. */
    private MultiValueTreeMap<String, ViolationPersistenceBean> nullMap;

    /** Data-type violations, grouped by file name. */
    private MultiValueTreeMap<String, ViolationPersistenceBean> dataTypeMap;

    /** Regex violations, grouped by file name. */
    private MultiValueTreeMap<String, ViolationPersistenceBean> regexMap;

    /** Number-of-fields violations, grouped by file name. */
    private MultiValueTreeMap<String, ViolationPersistenceBean> numFieldsMap;

    /**
     * Reads the reducer configuration and initialises all per-task state.
     *
     * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
     */
    // NOTE(review): suppression kept for the DVLRUCache instantiation in case that type is
    // generic and used raw here - confirm and narrow or remove.
    @Override
    @SuppressWarnings({ "rawtypes", "unchecked" })
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        maxViolationsInReport = context.getConfiguration().getInt(DataValidationConstants.DV_NUM_REPORT_VIOLATION, 1000);
        String dir = context.getConfiguration().get(SLAVE_FILE_LOC);
        dirPath = JobUtil.getAndReplaceHolders(dir);
        fileHandlerMap = new DVLRUCache(DataValidationConstants.TEN);
        offsetLinesMap = new TreeMap<>();
        nullMap = new MultiValueTreeMap<String, ViolationPersistenceBean>(maxViolationsInReport);
        dataTypeMap = new MultiValueTreeMap<String, ViolationPersistenceBean>(maxViolationsInReport);
        regexMap = new MultiValueTreeMap<String, ViolationPersistenceBean>(maxViolationsInReport);
        numFieldsMap = new MultiValueTreeMap<String, ViolationPersistenceBean>(maxViolationsInReport);
        fileNames = new HashSet<String>();
    }

    /**
     * Aggregates all violations received for {@code key} and writes one summary per violation type
     * that occurred, in the order null check, data type, regex, number of fields.
     *
     * <p>If the key contains the marker {@code "DDAW"}, only the part before the first marker is
     * used as the directory name.
     *
     * @param key     the incoming key
     * @param values  the violation arrays emitted for this key
     * @param context the reducer context the summaries are written to
     */
    @Override
    public void reduce(Text key, Iterable<DataDiscrepanciesArrayWritable> values, Context context)
            throws IOException, InterruptedException {
        Text directoryKey = key;
        String[] falseSplits = key.toString().split("DDAW");
        if (falseSplits.length > 1) {
            directoryKey = new Text(falseSplits[0]);
        }
        createDirectory(directoryKey);

        // One tally per violation type, created lazily on the first violation of that type so
        // that types which never occur produce no summary record.
        Map<String, ViolationTally> tallies = new LinkedHashMap<String, ViolationTally>();

        for (DataDiscrepanciesArrayWritable discrepancies : values) {
            for (Writable writable : discrepancies.get()) {
                DataViolationWB dataViolationWB = (DataViolationWB) writable;
                if (dataViolationWB == null) {
                    continue;
                }
                MapWritable fieldMap = dataViolationWB.getFieldMap();
                if (fieldMap == null || fieldMap.isEmpty()) {
                    // NOTE(review): this aborts the whole reduce call, so nothing tallied so far
                    // for this key is written to the context. Behaviour kept as-is; confirm
                    // whether skipping only this bean was intended.
                    return;
                }

                // Remember how many records the originating split reported, keyed by file and
                // split end offset.
                long splitEndOffset = dataViolationWB.getSplitEndOffset().get();
                long totalRecEmiByMap = dataViolationWB.getTotalRecordsEmittByMap().get();
                String fileName = dataViolationWB.getFileName().toString();
                offsetLinesMap.put(new FileOffsetKey(fileName, splitEndOffset), totalRecEmiByMap);

                for (Entry<Writable, Writable> fieldEntry : fieldMap.entrySet()) {
                    FieldLWB fieldLWB = (FieldLWB) fieldEntry.getValue();
                    for (Entry<Writable, Writable> typeEntry : fieldLWB.getTypeViolationMap().entrySet()) {
                        ViolationLWB violationLWB = (ViolationLWB) typeEntry.getValue();
                        String violationType = typeEntry.getKey().toString();
                        for (LineLWB lineLWB : violationLWB.getLineLWBList()) {
                            MultiValueTreeMap<String, ViolationPersistenceBean> store = storeFor(violationType);
                            if (store == null) {
                                // Unknown violation type: ignored.
                                continue;
                            }
                            ViolationTally tally = tallies.get(violationType);
                            if (tally == null) {
                                tally = new ViolationTally();
                                tallies.put(violationType, tally);
                            }
                            recordViolation(tally, store, fieldEntry.getKey(), violationType, violationLWB,
                                    lineLWB, fileName, splitEndOffset);
                        }
                    }
                }
            }
        }

        closeFileHandlers();
        fileHandlerMap.clear();

        for (String violationType : SUMMARY_ORDER) {
            ViolationTally tally = tallies.get(violationType);
            if (tally != null) {
                writeViolations(violationType, context, tally);
            }
        }
    }

    /**
     * Returns the per-file store that holds violations of the given type.
     *
     * @param violationType the violation type
     * @return the matching store, or {@code null} if the type is not one of the four known types
     */
    private MultiValueTreeMap<String, ViolationPersistenceBean> storeFor(String violationType) {
        switch (violationType) {
        case DataValidationConstants.NUM_OF_FIELDS_CHECK:
            return numFieldsMap;
        case DataValidationConstants.USER_DEFINED_NULL_CHECK:
            return nullMap;
        case DataValidationConstants.USER_DEFINED_DATA_TYPE:
            return dataTypeMap;
        case DataValidationConstants.USER_DEFINED_REGEX_CHECK:
            return regexMap;
        default:
            return null;
        }
    }

    /**
     * Records a single violation: updates the tally, stores a persistence bean for the later file
     * dump and bumps the per-field and per-file counters.
     */
    private void recordViolation(ViolationTally tally, MultiValueTreeMap<String, ViolationPersistenceBean> store,
            Writable fieldKey, String violationType, ViolationLWB violationLWB, LineLWB lineLWB, String fileName,
            long splitEndOffset) {
        // A tuple is identified by file name + line number, so several violations on the same
        // line count as one dirty tuple.
        tally.dirtyTuples.add(fileName + lineLWB.getLineNumber());
        tally.total++;

        ViolationPersistenceBean bean = new ViolationPersistenceBean(((IntWritable) fieldKey).get(),
                lineLWB.getLineNumber().get(), violationLWB.getExpectedValue().toString(),
                lineLWB.getActualValue().toString(), violationType, fileName, splitEndOffset);
        store.add(bean.getFileName(), bean);
        fileNames.add(bean.getFileName());

        processTupleViolation(tally.fieldCounts, tally.fileCounts, fieldKey, fileName);
    }

    /**
     * Converts a line number that is relative to its split into a file-level line number by adding
     * the record counts of all splits of the same file that precede the bean's split in
     * {@link #offsetLinesMap}.
     *
     * @param bean the bean whose line number is to be translated
     * @return the bean's line number plus the preceding record counts, narrowed to {@code int}
     */
    public int calculateActualLineNo(ViolationPersistenceBean bean) {
        long splitEndOff = bean.getSplitEndOffset();
        String beanFile = bean.getFileName();
        long sum = 0;
        // Entries iterate in (file name, offset) order; add up the counts of this file's splits
        // until the bean's own split is reached.
        for (Entry<FileOffsetKey, Long> entry : offsetLinesMap.entrySet()) {
            FileOffsetKey key = entry.getKey();
            if (!key.getFileName().equals(beanFile)) {
                continue;
            }
            if (key.getOffset() == splitEndOff) {
                break;
            }
            sum += entry.getValue();
        }
        // NOTE(review): the narrowing cast can overflow for very large files; the return type is
        // part of the public signature and is therefore left unchanged.
        return (int) (bean.getLineNum() + sum);
    }

    /**
     * Builds the summary for one violation type from its tally and writes it to the context.
     *
     * @param violationType the violation type, used as the output key
     * @param context       the reducer context
     * @param tally         the counters collected for this violation type
     */
    private void writeViolations(String violationType, Context context, ViolationTally tally)
            throws IOException, InterruptedException {
        ArrayListWritable<FileViolationsWritable> fileVioWriList = new ArrayListWritable<FileViolationsWritable>();
        for (Map.Entry<String, Integer> fileCount : tally.fileCounts.entrySet()) {
            FileViolationsWritable fvWritable = new FileViolationsWritable();
            fvWritable.setFileName(fileCount.getKey());
            fvWritable.setNumOfViolations(fileCount.getValue());
            fileVioWriList.add(fvWritable);
        }

        long dirtyTuple = tally.dirtyTuples.size();
        DataViolationWritable dataViolationWritable = new DataViolationWritable();
        dataViolationWritable.setFieldMap(tally.fieldCounts);
        dataViolationWritable.setTotalViolations(tally.total);
        dataViolationWritable.setFileViolationsWritables(fileVioWriList);
        dataViolationWritable.setDirtyTuple(dirtyTuple);
        context.write(new Text(violationType), dataViolationWritable);
    }

    /**
     * Increments the violation counter of the given field and of the given file.
     *
     * @param fieldMapWritable       per-field counters (field number to count)
     * @param fieldFileViolationsMap per-file counters (file name to count)
     * @param fieldNo                the field number; must be an {@link IntWritable}
     * @param fileName               the file the violation occurred in
     */
    private void processTupleViolation(MapWritable fieldMapWritable, Map<String, Integer> fieldFileViolationsMap,
            Writable fieldNo, String fileName) {
        IntWritable fieldNumber = (IntWritable) fieldNo;
        IntWritable fieldViolations = setFieldViolations((IntWritable) fieldMapWritable.get(fieldNumber));
        fieldMapWritable.put(fieldNumber, fieldViolations);

        int violations = extractViolationsFromMap(fieldFileViolationsMap, fileName) + 1;
        fieldFileViolationsMap.put(fileName, violations);
    }

    /**
     * Creates the directory {@code dirPath/key} (including missing parents) and requests read and
     * write permission for all users on it.
     *
     * @param key the directory name
     */
    private void createDirectory(Text key) {
        File f = new File(dirPath + File.separator + key.toString());
        f.mkdirs();
        f.setReadable(true, false);
        f.setWritable(true, false);
    }

    /**
     * Looks up the current violation count of a file.
     *
     * @param fileViolationsMap the file violations map
     * @param fileName          the file name
     * @return the stored count, or {@code 0} if the file has no entry yet
     */
    private int extractViolationsFromMap(Map<String, Integer> fileViolationsMap, String fileName) {
        Integer violations = fileViolationsMap.get(fileName);
        return violations == null ? 0 : violations;
    }

    /**
     * Increments a field's violation counter in place, creating it with value {@code 1} if it does
     * not exist yet.
     *
     * @param fieldViolations the existing counter, may be {@code null}
     * @return the incremented (or newly created) counter
     */
    private IntWritable setFieldViolations(final IntWritable fieldViolations) {
        if (fieldViolations == null) {
            return new IntWritable(1);
        }
        fieldViolations.set(fieldViolations.get() + 1);
        return fieldViolations;
    }

    /**
     * Appends one pipe-separated violation line
     * ({@code lineNumber|fieldNumber|expectedValue|actualValue}) followed by the platform line
     * separator to the given buffer. A field number of {@code -1} is rendered as {@code "-"}.
     *
     * @param buffer        the buffer to append to
     * @param fieldNumber   the field number, or {@code -1}
     * @param lineNumber    the line number
     * @param expectedValue the expected value
     * @param actualValue   the actual value
     */
    private void appendViolationToBuffer(StringBuilder buffer, int fieldNumber, long lineNumber,
            String expectedValue, String actualValue) {
        buffer.append(lineNumber);
        buffer.append(Constants.PIPE_SEPARATOR);
        if (fieldNumber == -1) {
            buffer.append("-");
        } else {
            buffer.append(fieldNumber);
        }
        buffer.append(Constants.PIPE_SEPARATOR).append(expectedValue).append(Constants.PIPE_SEPARATOR)
                .append(actualValue).append(System.lineSeparator());
    }

    /**
     * Returns a writer for a violation file of the given source file and violation type, opening
     * it if necessary.
     *
     * <p>The path is {@code dirPath/violationType/fileName-<random int>-<nanoTime>}; because of
     * the random suffix every call yields a new path.
     *
     * @param fileName      the file name
     * @param violationType the violation type (used as sub-directory name)
     * @return the writer registered in {@link #fileHandlerMap} for the generated path
     * @throws IOException if the file cannot be opened for writing
     */
    private BufferedWriter getFileHandler(String fileName, String violationType) throws IOException {
        String absoluteFilePath = dirPath + File.separator + violationType + File.separator + fileName + "-"
                + new Random().nextInt() + "-" + System.nanoTime();
        BufferedWriter out = fileHandlerMap.get(absoluteFilePath);
        if (out == null) {
            File f = new File(absoluteFilePath);
            out = new BufferedWriter(new FileWriter(f));
            // Permissions can only be changed on an existing file, so they are set after
            // FileWriter has created it (setting them beforehand has no effect).
            f.setReadable(true, false);
            f.setWritable(true, false);
            fileHandlerMap.put(absoluteFilePath, out);
        }
        return out;
    }

    /**
     * Closes every writer currently held in {@link #fileHandlerMap}. All writers are attempted
     * even if one fails; the first failure is rethrown with later ones attached as suppressed.
     *
     * @throws IOException if at least one writer could not be closed
     */
    private void closeFileHandlers() throws IOException {
        IOException firstFailure = null;
        for (BufferedWriter writer : fileHandlerMap.values()) {
            try {
                writer.close();
            } catch (IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                } else {
                    firstFailure.addSuppressed(e);
                }
            }
        }
        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /**
     * Writes all violations stored for {@code fileName} in {@code store} to a new violation file
     * of the given type. Does nothing if the store is {@code null}, empty, or has no elements for
     * the file.
     */
    private void persistViolations(MultiValueTreeMap<String, ViolationPersistenceBean> store, String fileName,
            String violationType) throws IOException {
        if (store == null || store.isEmpty() || store.getAllElements(fileName) == null) {
            return;
        }
        StringBuilder report = new StringBuilder();
        for (ViolationPersistenceBean bean : store.getAllElements(fileName)) {
            if (bean != null && bean.getViolationType() != null) {
                appendViolationToBuffer(report, bean.getFieldNum(), bean.getLineNum(), bean.getExpectedValue(),
                        bean.getActualValue());
            }
        }
        BufferedWriter out = getFileHandler(fileName, violationType);
        out.write(report.toString());
        out.flush();
    }

    /**
     * Dumps the violations collected during the task to local files (one file per source file and
     * violation type) and closes all writers, also when writing fails part-way.
     *
     * @see org.apache.hadoop.mapreduce.Reducer#cleanup(org.apache.hadoop.mapreduce.Reducer.Context)
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        try {
            for (String fileName : fileNames) {
                persistViolations(nullMap, fileName, DataValidationConstants.USER_DEFINED_NULL_CHECK);
                persistViolations(dataTypeMap, fileName, DataValidationConstants.USER_DEFINED_DATA_TYPE);
                persistViolations(regexMap, fileName, DataValidationConstants.USER_DEFINED_REGEX_CHECK);
                persistViolations(numFieldsMap, fileName, DataValidationConstants.NUM_OF_FIELDS_CHECK);
            }
        } catch (IOException | RuntimeException e) {
            // Do not leak the writers opened so far; keep the original failure as the primary one.
            try {
                closeFileHandlers();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        closeFileHandlers();
        super.cleanup(context);
    }

    /** Counters collected for one violation type during a single {@code reduce} call. */
    private static final class ViolationTally {

        /** Distinct "file name + line number" strings that had at least one violation. */
        private final Set<String> dirtyTuples = new HashSet<String>();

        /** Violation count per field number. */
        private final MapWritable fieldCounts = new MapWritable();

        /** Violation count per file name, in first-seen order. */
        private final Map<String, Integer> fileCounts = new LinkedHashMap<String, Integer>();

        /** Total number of violations of this type. */
        private long total;
    }

    /**
     * Key of {@link #offsetLinesMap}: a file name together with a split end offset. Ordered by
     * file name first and offset second.
     */
    private static class FileOffsetKey implements Comparable<FileOffsetKey> {

        private final String fileName;
        private final long offset;

        FileOffsetKey(String fileName, long offset) {
            this.fileName = fileName;
            this.offset = offset;
        }

        public String getFileName() {
            return fileName;
        }

        public long getOffset() {
            return offset;
        }

        @Override
        public int compareTo(FileOffsetKey o) {
            int byName = this.fileName.compareTo(o.getFileName());
            if (byName != 0) {
                return byName;
            }
            return Long.compare(this.offset, o.getOffset());
        }

        @Override
        public String toString() {
            return "Com [" + fileName + "," + offset + "]";
        }
    }

    /**
     * A single violation kept in memory until it is written to a file in {@code cleanup}.
     *
     * <p>This is deliberately a non-static inner class: {@link #compareTo} orders beans by
     * {@link DataValidationReducer#calculateActualLineNo}, which needs the enclosing reducer's
     * {@code offsetLinesMap}.
     */
    class ViolationPersistenceBean implements Comparable<ViolationPersistenceBean> {

        private int fieldNum;
        private long lineNum;
        private String expectedValue;
        private String actualValue;
        private String violationType;
        private String fileName;
        private long splitEndOffset;

        public ViolationPersistenceBean() {
        }

        public ViolationPersistenceBean(int fieldNum, long lineNum, String expectedValue, String actualValue,
                String violationType, String fileName, long splitEndOffset) {
            this.fieldNum = fieldNum;
            this.lineNum = lineNum;
            this.expectedValue = expectedValue;
            this.actualValue = actualValue;
            this.violationType = violationType;
            this.fileName = fileName;
            this.splitEndOffset = splitEndOffset;
        }

        public int getFieldNum() {
            return fieldNum;
        }

        public String getExpectedValue() {
            return expectedValue;
        }

        public String getActualValue() {
            return actualValue;
        }

        public String getViolationType() {
            return violationType;
        }

        public String getFileName() {
            return fileName;
        }

        public long getSplitEndOffset() {
            return splitEndOffset;
        }

        @Override
        public String toString() {
            return "ViolationPersistenceBean [fieldNum=" + fieldNum + ", lineNum=" + getLineNum()
                    + ", expectedValue=" + expectedValue + ", actualValue=" + actualValue + ", violationType="
                    + violationType + ", fileName=" + fileName + ", splitEndOffset=" + splitEndOffset + "]";
        }

        public long getLineNum() {
            return lineNum;
        }

        public void setLineNum(long lineNum) {
            this.lineNum = lineNum;
        }

        /**
         * Orders beans by their file-level line number as computed by
         * {@link DataValidationReducer#calculateActualLineNo}.
         */
        @Override
        public int compareTo(ViolationPersistenceBean otherViolations) {
            // Each calculateActualLineNo call walks offsetLinesMap, so evaluate it once per side.
            return Integer.compare(calculateActualLineNo(this), calculateActualLineNo(otherViolations));
        }
    }
}