// Java source code of IngestHTRIntoAbbyyXML

package eu.transkribus.core.io.formats;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.eclipse.persistence.tools.file.FileUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.google.common.io.Files;
import com.itextpdf.text.Image;

import eu.transkribus.core.io.LocalDocReader;
import eu.transkribus.core.model.beans.pagecontent.PcGtsType;
import eu.transkribus.core.util.JaxbUtils;
import eu.transkribus.core.util.PageXmlUtils;
import eu.transkribus.core.util.XmlUtils;
import eu.transkribus.interfaces.types.util.ImageUtils;

import java.awt.Dimension;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;

import javax.imageio.ImageIO;
import javax.xml.bind.JAXBException;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

public class IngestHTRIntoAbbyyXML {
	/** Logger shared by all static methods of this class. */
	private static final Logger logger = LoggerFactory.getLogger(IngestHTRIntoAbbyyXML.class);
	
	/**
	 * Mode switch evaluated by {@code main}: {@code true} runs the branch that creates
	 * page XML files from a 'convertedAbbyy' folder, {@code false} (the value set here)
	 * runs the branch that merges HTR results into OCR files.
	 */
	private static boolean createFinalPageXml = false;

	/**
	 * Command line entry point; the static flag {@code createFinalPageXml} selects the mode.
	 * <ul>
	 * <li>Flag set: {@code args[2]} names an image directory that contains a
	 * {@code convertedAbbyy} sub folder. For every file in that sub folder whose
	 * {@code <name>.jpg} exists in the image directory, a page XML is created in the
	 * {@code convertedPage} sub folder, unless a file of that name is already there.</li>
	 * <li>Flag not set: exactly two arguments are expected, the HTR/image directory
	 * followed by the OCR directory; both are handed to
	 * {@code ingestHtrIntoAbbyyForAllFolders}.</li>
	 * </ul>
	 *
	 * @param args command line arguments as described above
	 * @throws IOException declared by the folder merge that is called in the second mode
	 */
	public static void main(String[] args) throws IOException {

		if (createFinalPageXml){

			// this mode reads its only input from the third argument
			if (args.length < 3){
				logger.error("Page XML mode expects the image directory as third argument");
				usage();
				return;
			}

			String imgDir = args[2];

			File imgFileDir = new File(imgDir);
			File convertedFileDir = new File(imgDir +"/convertedAbbyy/" );

			if(!convertedFileDir.exists()){
				logger.error("'convertedAbbyy' directoriy does not exist: " + args[2]);
				return;
			}

			// listFiles() yields null if the path is no directory or can not be read
			File[] ocrFiles = convertedFileDir.listFiles();
			if (ocrFiles == null){
				logger.error("Can not list files of " + convertedFileDir.getAbsolutePath());
				return;
			}

			String resultDir = imgFileDir.getAbsolutePath()+"/convertedPage/";
			new File(resultDir).mkdirs();

			for (File abbyyXml : ocrFiles){

				String imgFn = imgDir + File.separator +  StringUtils.substringBefore(abbyyXml.getName(),".xml") + ".jpg";
				logger.debug("imgFn " + imgFn);

				File imgFile = new File(imgFn);
				if (!imgFile.exists()){
					continue;
				}

				// check the target first so that finished pages do not cost an image decode
				File pageOutFile = new File(resultDir+abbyyXml.getName());
				if (pageOutFile.exists()){
					continue;
				}

				try {
					BufferedImage img = ImageIO.read(imgFile.getAbsoluteFile());
					// ImageIO.read returns null when no registered reader understands the file
					if (img == null){
						logger.error("Image can not be decoded: " + imgFile.getAbsolutePath());
						continue;
					}

					createPageXml(pageOutFile, true, abbyyXml, true, true, false, imgFile.getName(), new Dimension(img.getWidth(), img.getHeight()));
				}
				catch (IOException e) {
					logger.error("Could not create page XML for " + abbyyXml.getAbsolutePath(), e);
				}
			}

		}
		else{

			//Y:\HTR\Wiener_Adressbuch\done\htr_in Y:\HTR\Wiener_Adressbuch\done\ocr
			if(args.length != 2){
				usage();
				return;
			}

			File htrMainDir = new File(args[0]);
			File ocrMainDir = new File(args[1]);

			ingestHtrIntoAbbyyForAllFolders(ocrMainDir, htrMainDir);

		}

	}
	
	/**
	 * Merges the HTR results into the matching OCR files for every document folder.
	 * <p>
	 * For each entry {@code <doc>} of {@code htrAndImgDir} the HTR files are read from
	 * {@code <doc>/page}, the OCR files from {@code ocrDir/<doc>/ocr} and the images
	 * from the regular files directly inside {@code <doc>}. An OCR file is matched to an
	 * HTR file by the HTR base name plus {@code .xml}. Every pair is passed to
	 * {@code combineHTRAndOCR} with the target file
	 * {@code <parent of htrAndImgDir>/converted/<doc>/ocr/<htr file name>}; pairs whose
	 * target already exists are skipped.
	 * <p>
	 * Whenever the per-folder page counter satisfies {@code nr % 500 == 400} (pages 400,
	 * 900, ...) and the merge returned a file, the image at the same index of the
	 * name-sorted image list is copied to {@code <parent>/samples} and a page XML is
	 * created in {@code <parent>/samples/page}.
	 * <p>
	 * Processing is aborted (after logging and printing the usage) if a start directory
	 * is missing or the folder/file counts of the two sides differ. A document folder
	 * whose {@code page} or {@code ocr} sub folder can not be listed is skipped.
	 *
	 * @param ocrDir directory holding one sub folder per document with an {@code ocr} folder inside
	 * @param htrAndImgDir directory holding one sub folder per document with the images and a {@code page} folder inside
	 * @throws IOException declared for callers; I/O failures while writing a sample are logged here
	 */
	private static void ingestHtrIntoAbbyyForAllFolders(File ocrDir, File htrAndImgDir) throws IOException {
		if(!ocrDir.exists() || !htrAndImgDir.exists()){
			logger.error("One of the start directories does not exist: " + ocrDir + " or " + htrAndImgDir);
			usage();
			return;
		}

		// listFiles() yields null if the path is no directory or can not be read
		File[] docDirs = htrAndImgDir.listFiles();
		File[] ocrDocDirs = ocrDir.listFiles();
		if (docDirs == null || ocrDocDirs == null){
			logger.error("One of the start directories can not be listed: " + ocrDir + " or " + htrAndImgDir);
			usage();
			return;
		}

		if (docDirs.length != ocrDocDirs.length){
			logger.error("Directories must contain the same number of folders!");
			usage();
			return;
		}

		// resolve via the absolute file: getParentFile() is null for a relative single-name path
		File baseDir = htrAndImgDir.getAbsoluteFile().getParentFile();
		if (baseDir == null){
			logger.error("No parent directory available for the results of " + htrAndImgDir);
			return;
		}

		Comparator<File> byNameIgnoreCase = (a, b) -> a.getName().compareToIgnoreCase(b.getName());

		int countAllConverted = 0;
		for (File imgFileDir : docDirs){
			File ocrFileDir = new File(ocrDir.getAbsolutePath() + File.separator + imgFileDir.getName() + "/ocr");
			logger.debug("ocr folder: " + ocrFileDir.getAbsolutePath());
			File[] ocrFiles = ocrFileDir.listFiles();

			File htrFileDir = new File(imgFileDir.getAbsolutePath() + "/page");
			logger.debug("htr folder: " + htrFileDir.getAbsolutePath());
			File[] htrFiles = htrFileDir.listFiles();

			logger.debug("img folder: " + imgFileDir.getAbsolutePath());
			File[] imgFiles = imgFileDir.listFiles((FileFilter) File::isFile);

			// a stray file or an incomplete document folder must not end in a NullPointerException
			if (ocrFiles == null || htrFiles == null || imgFiles == null){
				logger.error("Skipping " + imgFileDir.getAbsolutePath() + ": the 'page' or 'ocr' folder can not be listed");
				continue;
			}

			File resultDir = new File(baseDir.getAbsolutePath() + "/converted" + File.separator + imgFileDir.getName() + File.separator + "ocr");
			resultDir.mkdirs();
			logger.debug("resultDir folder: " + resultDir.getAbsolutePath());

			File sampleDir = new File(baseDir.getAbsolutePath() + "/samples" + File.separator);
			sampleDir.mkdirs();
			logger.debug("sampleDir folder: " + sampleDir.getAbsolutePath());

			if (htrFiles.length != ocrFiles.length){
				logger.error("Directories must contain the same number of files!");
				usage();
				return;
			}

			Arrays.sort(htrFiles, byNameIgnoreCase);
			Arrays.sort(ocrFiles, byNameIgnoreCase);
			Arrays.sort(imgFiles, byNameIgnoreCase);

			// nr counts the HTR files of this folder that have an OCR counterpart
			int nr = 0;
			for (File htr : htrFiles){

				String ocrFn = ocrFileDir + File.separator + FilenameUtils.getBaseName(htr.getName())+ ".xml";
				logger.debug("ocrFn " + ocrFn);

				File ocrFile = new File(ocrFn);
				if (!ocrFile.exists()){
					logger.error("No ocr File found :(");
					continue;
				}

				File convertedFile = new File(resultDir.getAbsolutePath()+File.separator+htr.getName());
				if (convertedFile.exists()){
					logger.debug("Already converted (" + countAllConverted++ + ")");
					nr++;
					continue;
				}

				File abbyyXml = combineHTRAndOCR(htr, ocrFile, convertedFile);

				// take a test sample at page counter 400, 900, 1400, ...
				if (abbyyXml != null && nr % 500 == 400){
					if (nr < imgFiles.length){
						File sampleImg = imgFiles[nr];

						String resultPageDir = sampleDir.getAbsolutePath()+"/page/";
						new File(resultPageDir).mkdirs();
						File pageOutFile = new File(resultPageDir+htr.getName());

						try {
							BufferedImage img = ImageIO.read(sampleImg.getAbsoluteFile());
							// ImageIO.read returns null when no registered reader understands the file
							if (img == null){
								logger.error("Image can not be decoded, no sample written: " + sampleImg.getAbsolutePath());
							}
							else{
								FileUtils.copyFileToDirectory(sampleImg, sampleDir);
								createPageXml(pageOutFile, true, abbyyXml, true, true, false, sampleImg.getName(), new Dimension(img.getWidth(), img.getHeight()));
							}
						}
						catch (IOException e) {
							logger.error("Could not write sample for " + htr.getAbsolutePath(), e);
						}
					}
					else{
						// fewer images than pages: there is nothing to sample at this index
						logger.error("No image at index " + nr + " in " + imgFileDir.getAbsolutePath() + " - no sample written");
					}
				}
				nr++;
			}

		}

	}

	/**
	 * Ad-hoc developer check: feeds one hard-coded HTR file and one hard-coded OCR file
	 * from a local project folder into {@code ingestHTRIntoOCR}.
	 */
	private static void startSimpleTestWithFilenames(){
		// fixed sample pair, HTR file first and OCR file second
		final File htrSample = new File("C:/01_Projekte/READ/Projekte/Lehmann/Konvertierung/page/00000101.xml");
		final File ocrSample = new File("C:/01_Projekte/READ/Projekte/Lehmann/Konvertierung/ocr/3697706.xml");

		ingestHTRIntoOCR(htrSample, ocrSample);
	}
	
	/**
	 * Reads the {@code pageId} attribute from the given XML file.
	 * <p>
	 * All elements matching {@code //TranskribusMetadata} are visited in document order
	 * and the value of the first {@code pageId} attribute found is returned.
	 *
	 * @param htrFile XML file to inspect
	 * @return the {@code pageId} value, or {@code null} if no matching element carries
	 *         the attribute or the file can not be read or parsed
	 */
	private static String findOcrFilename(File htrFile){
		// try-with-resources closes the stream on every path
		try (FileInputStream fileIsHTR = new FileInputStream(htrFile)) {

			DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
			DocumentBuilder builder = builderFactory.newDocumentBuilder();
			Document htrDocument = builder.parse(fileIsHTR);

			XPath xPath = XPathFactory.newInstance().newXPath();
			String expressionMd = "//TranskribusMetadata";

			NodeList nodeListHTR = (NodeList) xPath.compile(expressionMd).evaluate(htrDocument, XPathConstants.NODESET);

			System.out.println("nodeList length " + nodeListHTR.getLength());

			for (int i = 0; i < nodeListHTR.getLength(); i++){
				Node pageId = nodeListHTR.item(i).getAttributes().getNamedItem("pageId");
				if (pageId != null){
					return pageId.getTextContent();
				}
			}

		} catch (XPathExpressionException | ParserConfigurationException | SAXException | IOException e) {
			// FileNotFoundException is an IOException and therefore covered as well
			e.printStackTrace();
		}
		return null;
	}
	
	
	/**
	 * Replaces the character content of an OCR XML file with the line texts of an HTR
	 * XML file and writes the result into the OCR file's directory under the HTR file's
	 * name.
	 * <p>
	 * HTR lines are selected with {@code //TextLine/TextEquiv/Unicode}, OCR lines with
	 * {@code //line}; the i-th HTR line is mapped onto the i-th OCR line and its k-th
	 * character onto the k-th {@code charParams} element of that line. Per line:
	 * <ul>
	 * <li>identical text: the OCR line is left untouched;</li>
	 * <li>more {@code charParams} than HTR characters: the surplus elements are removed
	 * together with a preceding whitespace-only text node;</li>
	 * <li>differing character: {@code deleteAllObsoleteAtttributes} is applied and the
	 * text is replaced by the HTR character; if the character counts differ, the
	 * {@code l}/{@code r} attributes are recomputed by spreading the line width evenly
	 * over the HTR characters; an existing {@code wordStart} attribute two siblings
	 * further on is set to 1 (HTR character is a space) or 0 (only the OCR character
	 * is a space);</li>
	 * <li>more HTR characters than {@code charParams}: new {@code charParams} elements
	 * with computed {@code l}/{@code r} and the line's {@code t}/{@code b} are appended
	 * to the last element child of the line.</li>
	 * </ul>
	 * If the HTR file has more lines than the OCR file, merging stops at the first
	 * unmatched line and the lines merged so far are written. Parse, XPath, I/O and
	 * transformer failures are logged and not rethrown.
	 *
	 * @param htrFile XML file providing the line texts
	 * @param ocrFile XML file providing the lines and their {@code charParams}
	 */
	private static void ingestHTRIntoOCR(File htrFile, File ocrFile) {
		try {

			DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
			DocumentBuilder builder = builderFactory.newDocumentBuilder();

			Document htrDocument;
			Document ocrDocument;
			// both streams are only needed for parsing and are closed right afterwards
			try (FileInputStream fileIsHTR = new FileInputStream(htrFile);
					FileInputStream fileIsOCR = new FileInputStream(ocrFile)) {
				htrDocument = builder.parse(fileIsHTR);
				ocrDocument = builder.parse(fileIsOCR);
			}

			XPath xPath = XPathFactory.newInstance().newXPath();
			String expressionHTR = "//TextLine/TextEquiv/Unicode";
			String expressionOCR = "//line";
			//xPath: get all charParams for current node
			String expAllCharParamsOfLine = ".//charParams";

			NodeList nodeListOCR = (NodeList) xPath.compile(expressionOCR).evaluate(ocrDocument, XPathConstants.NODESET);
			NodeList nodeListHTR = (NodeList) xPath.compile(expressionHTR).evaluate(htrDocument, XPathConstants.NODESET);

			int countEqualLines = 0;
			int countUnequalLines = 0;
			int countEqualLineLengths = 0;
			int countUnequalLinesLenths = 0;

			logger.debug("nodeList length " + nodeListHTR.getLength());
			for (int i = 0; i < nodeListHTR.getLength(); i++){

				String htrLineString = nodeListHTR.item(i).getTextContent();

				Node ocrLine = nodeListOCR.item(i);
				if (ocrLine == null){
					// the HTR result has more lines than the OCR file: nothing left to map onto
					logger.error("HTR file " + htrFile.getName() + " has more lines than OCR file " + ocrFile.getName() + " - stopping at line " + i);
					break;
				}

				// additional charParams go into the last element child of the line; trailing
				// text nodes are skipped because nothing can be appended to them
				Node appendTarget = ocrLine.getLastChild();
				while (appendTarget != null && appendTarget.getNodeType() != Node.ELEMENT_NODE){
					appendTarget = appendTarget.getPreviousSibling();
				}
				if (appendTarget == null){
					appendTarget = ocrLine;
				}

				String left = "", right = "", top = "", bottom = "";

				NamedNodeMap ocrLineAttributes = ocrLine.getAttributes();
				for (int a = 0; a < ocrLineAttributes.getLength(); a++){
					Node currAttr = ocrLineAttributes.item(a);
					// every case needs its break, otherwise one attribute overwrites the following ones
					switch (currAttr.getNodeName()){
						case "l": left = currAttr.getNodeValue(); break;
						case "r": right = currAttr.getNodeValue(); break;
						case "t": top = currAttr.getNodeValue(); break;
						case "b": bottom = currAttr.getNodeValue(); break;
						default: break;
					}
				}

				NodeList nlCharParams = (NodeList) xPath.compile(expAllCharParamsOfLine).evaluate(ocrLine, XPathConstants.NODESET);
				int nrOfCharParams = nlCharParams.getLength();

				StringBuilder ocrLineBuilder = new StringBuilder();
				for (int k = 0; k < nrOfCharParams; k++){
					ocrLineBuilder.append(nlCharParams.item(k).getTextContent());
				}
				String ocrLineString = ocrLineBuilder.toString();

				if (htrLineString.length() == ocrLineString.length()){
					countEqualLineLengths++;
				}
				else{
					countUnequalLinesLenths++;
				}

				if (htrLineString.equals(ocrLineString)){
					// ocr and htr are the same - keep charparams as they are
					countEqualLines++;
					continue;
				}
				countUnequalLines++;

				int lineLeft = Integer.parseInt(left);
				int lineWidth = Integer.parseInt(right) - lineLeft;

				boolean differentLength = nrOfCharParams != htrLineString.length();
				// floating point division keeps the fractional part of the width; an empty HTR
				// line needs no width because all of its charParams are removed below
				float newCharWidth = htrLineString.isEmpty() ? 0f : (float) lineWidth / htrLineString.length();

				/*
				 * cases to be considered
				 * (1) ocr line shorter than htr -> add charparams for all additional chars
				 * (2) htr line shorter than ocr -> delete needless chars
				 * (3) equal size -> only replace text content for unequal chars
				 * (4) empty char in htr text but not in ocr
				 */
				for (int k = 0; k < nrOfCharParams; k++){

					Node charParamInOCR = nlCharParams.item(k);

					//case 2: pointless ocr charParams need to be deleted
					if (k >= htrLineString.length()){
						//text node of this element needs to be removed as well - otherwise you get empty lines in xml
						Node prevElem = charParamInOCR.getPreviousSibling();
						if (prevElem != null
								&& prevElem.getNodeType() == Node.TEXT_NODE
								&& prevElem.getNodeValue().trim().length() == 0) {
							charParamInOCR.getParentNode().removeChild(prevElem);
						}
						charParamInOCR.getParentNode().removeChild(charParamInOCR);
						continue;
					}

					char htrChar = htrLineString.charAt(k);
					String ocrCharText = charParamInOCR.getTextContent();
					// a charParams element without text has no character and always counts as different
					boolean ocrHasChar = !ocrCharText.isEmpty();

					// compare primitive chars: '!=' on boxed Characters compares references
					if (ocrHasChar && ocrCharText.charAt(0) == htrChar){
						continue;
					}

					//if chars differ: replace ocr with htr (includes case 3, valid for case 1 and 2 too)

					//if we deal with different length of htr and ocr we newly calculate the coordinates of the bounding box of each character
					if (differentLength){
						int l = (int) (lineLeft+newCharWidth*k);
						int r = (int) (lineLeft+newCharWidth*(k+1));

						Element charParamElem = (Element) charParamInOCR;
						charParamElem.setAttribute("l", Integer.toString(l));
						charParamElem.setAttribute("r", Integer.toString(r));
					}

					// a space in the HTR text starts a new word after it (wordStart=1); a space
					// that only the OCR had must no longer start one (wordStart=0)
					String newWordStart = null;
					if (Character.isSpaceChar(htrChar)){
						newWordStart = "1";
					}
					else if (ocrHasChar && Character.isSpaceChar(ocrCharText.charAt(0))){
						newWordStart = "0";
					}

					if (newWordStart != null){
						Node next = charParamInOCR.getNextSibling();
						Node afterNext = next != null ? next.getNextSibling() : null;
						// getAttributes() is null for anything but elements
						if (afterNext != null && afterNext.getAttributes() != null){
							Node wordStart = afterNext.getAttributes().getNamedItem("wordStart");
							if (wordStart != null){
								wordStart.setNodeValue(newWordStart);
							}
						}
					}

					deleteAllObsoleteAtttributes(charParamInOCR);

					charParamInOCR.setTextContent(String.valueOf(htrChar));
				}

				//case 1: the first nrOfCharParams HTR chars were mapped above, append the rest
				for (int j = nrOfCharParams; j < htrLineString.length(); j++){

					/*
					 * calculate the left and right coordinates from lineWidth/numberOfHTRChars
					 * top and bottom are taken from line
					 */
					int l = (int) (lineLeft+newCharWidth*j);
					int r = (int) (lineLeft+newCharWidth*(j+1));

					Element additionalCharParam = ocrDocument.createElement("charParams");
					additionalCharParam.setAttribute("l", Integer.toString(l));
					additionalCharParam.setAttribute("t", top);
					additionalCharParam.setAttribute("r", Integer.toString(r));
					additionalCharParam.setAttribute("b", bottom);

					additionalCharParam.appendChild(ocrDocument.createTextNode(String.valueOf(htrLineString.charAt(j))));

					appendTarget.appendChild(additionalCharParam);
				}
			}

			TransformerFactory tff  = TransformerFactory.newInstance();
			Transformer transformer = tff.newTransformer();

			DOMSource xmlSource = new DOMSource(ocrDocument);
			// resolve via the absolute file: getParentFile() is null for a relative single-name path
			File outFile = new File(ocrFile.getAbsoluteFile().getParentFile(), htrFile.getName());
			StreamResult outputTarget = new StreamResult(outFile);

			transformer.transform(xmlSource, outputTarget);

			logger.debug("Statistics of file " + htrFile.getAbsolutePath());
			logger.debug("countEqualLines: "+countEqualLines);
			logger.debug("countUnequalLines: " +countUnequalLines);
			logger.debug("countEqualLineLengths: " +countEqualLineLengths);
			logger.debug("countUnequalLinesLenths: " + countUnequalLinesLenths);

		} catch (XPathExpressionException | ParserConfigurationException | SAXException | IOException | TransformerException e) {
			// FileNotFoundException and TransformerConfigurationException are covered by their super types
			logger.error("Could not ingest " + htrFile + " into " + ocrFile, e);
		}

	}
	
	/*
	 * experiment: combine OCR and HTR: the longest string wins
	 * seems to bring best results
	 * TODO: calculate error rates and compare!
	 */
	
	private static File combineHTRAndOCR(File htrFile, File ocrFile, File convertedFile) {
		try {
			
			FileInputStream fileIsHTR = new FileInputStream(htrFile);
			FileInputStream fileIsOCR = new FileInputStream(ocrFile);
				
			DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
			DocumentBuilder builder = builderFactory.newDocumentBuilder();
			Document htrDocument = builder.parse(fileIsHTR);
			Document ocrDocument = builder.parse(fileIsOCR);			
			
			XPath xPath = XPathFactory.newInstance().newXPath();
			String expressionHTR = "//TextLine/TextEquiv/Unicode";
			String expressionOCR = "//block[@blockType='Text']//line";
			String expressionOCRTables = "//block[@blockType='Table']//line";
			//xPath: get all charParams for current node
			String expAllCharParamsOfLine = ".//charParams";
			
			
			NodeList nodeListOCR = (NodeList) xPath.compile(expressionOCR).evaluate(ocrDocument, XPathConstants.NODESET);
			NodeList nodeListOCRTables = (NodeList) xPath.compile(expressionOCRTables).evaluate(ocrDocument, XPathConstants.NODESET);
			NodeList nodeListHTR = (NodeList) xPath.compile(expressionHTR).evaluate(htrDocument, XPathConstants.NODESET);
			

			
			int countEqualLines = 0;
			int countUnequalLines = 0;
			int countEqualLineLengths = 0;
			int countUnequalLinesLenths = 0;
			
//			String test = "test";
//			for (char c : test.toCharArray()){
//				System.out.println(c);
//				
//			}
			
			int tableNr = 0;
			//System.out.println("result is " + result);
			//		// and iterate on links
			//System.out.println("nodeList length " + nodeListHTR.getLength());
			for (int i = 0; i < nodeListHTR.getLength(); i++){
				
				String htrLineString = nodeListHTR.item(i).getTextContent();
				
//				NodeList formatting = nodeListOCR.item(i).getChildNodes();

//				for (int j = 0; j<formatting.getLength(); j++){
//					//NodeList charParams = formatting.item(j).getChildNodes();
//					
//					logger.debug("node " + formatting.item(j));
				
					Node ocrLine = (Node) nodeListOCR.item(i);
					Node formatting = null;
					if (ocrLine != null){
						//logger.debug("ocr line not null");
						formatting = ocrLine.getLastChild();
					}
					else{
						logger.debug("ocr line is null - try table lines");
						ocrLine = (Node) nodeListOCRTables.item(tableNr++);
					}
					
					if (ocrLine == null){
						logger.debug("ocr line is null");
						logger.debug("htr file name: " + htrFile.getName());
						logger.debug("ocr file name: " + ocrFile.getName());
					}
					//logger.debug(ocrLine.getLastChild().getNodeName());
					
					String left = "", right = "", top = "", bottom = "";
							
		        	NamedNodeMap ocrLineAttributes = ocrLine.getAttributes();
		        	for (int l = 0; l<ocrLineAttributes.getLength(); l++){
		        		Node currAttr = ocrLineAttributes.item(l);
		        		//logger.debug("node map name " + currAttr.getNodeName());
		        		switch (currAttr.getNodeName()){
			        		case "l": left=currAttr.getNodeValue();break;
			        		case "r": right=currAttr.getNodeValue();break;
			        		case "t": top=currAttr.getNodeValue();break;
			        		case "b": bottom=currAttr.getNodeValue();break;
			        		default: break;
		        		}
		        	}
		        	
		        	int lineLeft = Integer.valueOf(left);
		        	int lineRight = Integer.valueOf(right);
		        	int lineTop = Integer.valueOf(top);
		        	int lineBottom = Integer.valueOf(bottom);
				
					NodeList nlCharParams = (NodeList) xPath.compile(expAllCharParamsOfLine).evaluate(ocrLine, XPathConstants.NODESET);
//					logger.debug("ocr length: " + nlCharParams.getLength());
//					logger.debug("htr length: " + htrLineString.length());
									
					String ocrLineString = "";
					String penaltyString = "";
					boolean ocrIsWorse = false;
					for (int k = 0; k<nlCharParams.getLength(); k++){
						ocrLineString += nlCharParams.item(k).getTextContent();
//	        			Node penalty = nlCharParams.item(k).getAttributes().getNamedItem("wordPenalty");
//	        			Node charConf = nlCharParams.item(k).getAttributes().getNamedItem("charConfidence");
//	        			
//	        			if (penalty != null){
//	        				penaltyString = penalty.getTextContent();
//	        			}
//	        			
//	        			if (penalty != null && Integer.valueOf(penalty.getTextContent())>20){
//	        				logger.debug("word Penalty: " + penalty.getTextContent());
//	        				
//	        				ocrIsWorse = true;
//	        			}
//	        			if (charConf != null && charConf.getTextContent().equals("-1")){
//	        				logger.debug("char Confidence: " + charConf.getTextContent());
//	        				ocrIsWorse = true;
//	        			}
					}
					
//					logger.debug("htr line text: " + htrLineString);
//					logger.debug("ocr line text: " + ocrLineString);
					

					
					if (htrLineString.length() == ocrLineString.length()){
						countEqualLineLengths++;
					}
					else{
						countUnequalLinesLenths++;
					}
					
					if (htrLineString.equals(ocrLineString)){
//						logger.debug("ocr and htr are the same - keep charparams as they are!!");
						countEqualLines++;	
						continue;
					}
					
					countUnequalLines++;
					
					/*
					 * this HTR correction with better OCR does not bring better results for an already good HTR
					 */
					
					//if (ocrLineString.replaceAll("\\s","").length() > htrLineString.replaceAll("\\s","").length()){
//					if (ocrLineString.length() > htrLineString.length()
//							&& (ocrLineString.length() - htrLineString.length()>2)){
//						logger.debug("OCR String longer then HTR string - OCR could be better");
//						logger.debug("is ocr better? " + !ocrIsWorse);
//						//System.in.read();
//						//continue;
//					}
//					
//					if (ocrLineString.length() > htrLineString.length()
//							&& (ocrLineString.length() - htrLineString
//									.length()>2) && !ocrIsWorse){
//						logger.debug("OCR String longer then HTR string - OCR wins");
//						logger.debug("penaltyString " + penaltyString);
//						//System.in.read();
//						continue;
//					}
										
//					if(wordPenalty){
//						System.in.read();
//					}
					
					boolean differentLength = nlCharParams.getLength() != htrLineString.length();
					
					int nrOfChars = (htrLineString.length()>0 ? htrLineString.length() : ocrLineString.length());
					double newCharWidth = Math.ceil(( (Integer.valueOf(right)-Integer.valueOf(left))/nrOfChars));
					
					//logger.debug("new CharWidth is " + newCharWidth);
										
//					if (ocrLineString.length() > htrLineString.length()){
//						System.in.read();
//					}
					
					for (int k = 0; k<nlCharParams.getLength(); k++){
						
						Node charParamInOCR = nlCharParams.item(k);
						
			        	Character ocrChar = nlCharParams.item(k).getTextContent().charAt(0);
			        	Character htrChar = k<htrLineString.length()? htrLineString.charAt(k) : null;

//			        	NamedNodeMap nodeMap = charParamInOCR.getAttributes();
//			        	for (int l = 0; l<nodeMap.getLength(); l++){
//			        		logger.debug("node map name " + nodeMap.item(l).getNodeName());
//			        		logger.debug("node map " + nodeMap.item(l).getNodeValue());
//			        	}
						
						/*
						 * cases to be considered
						 * (1) ocr line shorter than htr -> add charparams for all additional chars
						 * (2) htr line shorter than ocr -> delete needless chars
						 * (3) equal size -> only replace text content for unequal chars
						 * (4) empty char in htr text but not in ocr
						 */
			        	
			        	//case 2: pointless ocr charParams need to be deleted
			        	if(htrChar == null){
//			        		logger.debug("remove child: " + charParamInOCR.getTextContent());
			        		
			        		//text node of this element needs to be removed as well - otherwise you get empty lines in xml
			                Node prevElem = charParamInOCR.getPreviousSibling();
			                if (prevElem != null && 
			                    prevElem .getNodeType() == Node.TEXT_NODE &&
			                    prevElem .getNodeValue().trim().length() == 0) {
			                	charParamInOCR.getParentNode().removeChild(prevElem);
			                }
			        		//System.in.read();
			        		charParamInOCR.getParentNode().removeChild(charParamInOCR);
			        		
			        		
			        		
			        		continue;
			        	}
			        	
		        		//if we deal with different length of htr and ocr we newly calculate the coordinates of the bounding box of each character 
		        		if (differentLength){
		        			int l = (int) (lineLeft+newCharWidth*k);
		        			int r = (int) (lineLeft+newCharWidth*(k+1));
		        			
		        			Node leftCoord = charParamInOCR.getAttributes().getNamedItem("l");
		        			leftCoord.setNodeValue(Integer.toString(l));
		        			
		        			Node rightCoord = charParamInOCR.getAttributes().getNamedItem("r");
		        			rightCoord.setNodeValue(Integer.toString(r));
		        		}
			        	
			        	//if chars differ: replace ocr with htr (includes case 3, valid for case 1 and 2 too)
			        	if (ocrChar != htrChar){

			        		//empty space - insert and add wordStart = 1;
			        		if (Character.isSpaceChar(htrChar)){//.equals("\u0020")){
			        			if (charParamInOCR.getNextSibling() != null && charParamInOCR.getNextSibling().getNextSibling() != null){
				        			Node wordStart = charParamInOCR.getNextSibling().getNextSibling().getAttributes().getNamedItem("wordStart");
				        			if (wordStart != null){
				        				wordStart.setNodeValue("1");
				        			}
				        			else{
				        				Element currNode = (Element) charParamInOCR.getNextSibling().getNextSibling();
				        				currNode.setAttribute("wordStart", "1");
				        			}
				        			
			        			}
			        		}
			        		//set wordStart=0
			        		else if (Character.isSpaceChar(ocrChar)){
			        			if (charParamInOCR.getNextSibling() != null && charParamInOCR.getNextSibling().getNextSibling() != null){
			        				//logger.debug(charParamInOCR.getNextSibling().getNextSibling().getNodeName());
				        			Node wordStart = charParamInOCR.getNextSibling().getNextSibling().getAttributes().getNamedItem("wordStart");
				        			if (wordStart != null){
				        				wordStart.setNodeValue("0");
				        			}
				        			else{
				        				Element currNode = (Element) charParamInOCR.getNextSibling().getNextSibling();
				        				currNode.setAttribute("wordStart", "0");
				        			}
			        			}
			        		}

			        		deleteAllObsoleteAtttributes(charParamInOCR);
			        		
			        		
			    		   //Element additionalCharParam = ocrDocument.createElement("charParams");
			        		
			        		//test if this is sufficient
			        		
			        		charParamInOCR.setTextContent(htrChar.toString());
 			        		
			    		   //charParamInOCR.getParentNode().replaceChild(charParamInOCR, charParamInOCR);
			        	}
			        	
						
						//nlCharParams.item(k).getParentNode().replaceChild(newChild, oldChild)
//						logger.debug("node name: " + nlCharParams.item(k).getNodeName());
//						if (nlCharParams.item(k).getNodeName().equals("charParams"))
//							logger.debug("charParams: " + nlCharParams.item(k).getTextContent());
//						if (k<htrLineString.length())
//							logger.debug("vs. HTR char " + htrLineString.charAt(k));
					}
					
					//case 1:
					int remaining = htrLineString.length()-ocrLineString.length();
					if (remaining > 0){
						//logger.debug("append node to line " + ocrLine.getTextContent());
						for (int j = ocrLineString.length(); j<htrLineString.length(); j++){
							
							/*
							 * calculate the left and right coordinates from lineWidth/numberOfHTRChars
							 * top and bottom are taken from line
							 */
							
		        			int l = (int) (lineLeft+newCharWidth*j);
		        			int r = (int) (lineLeft+newCharWidth*(j+1));
		        			
							Character htrChar = htrLineString.charAt(j);
							Element additionalCharParam = ocrDocument.createElement("charParams");
							additionalCharParam.setAttribute("l", Integer.toString(l));
							additionalCharParam.setAttribute("t", top);
							additionalCharParam.setAttribute("r", Integer.toString(r));
							additionalCharParam.setAttribute("b", bottom);
			    		   			    		   
							additionalCharParam.appendChild(ocrDocument.createTextNode(htrChar.toString()));
							
							if (formatting != null){
								formatting.appendChild(additionalCharParam);
								//logger.debug("node added " + node.getTextContent());
								//System.in.read();
							}
							else{
								ocrLine.appendChild(additionalCharParam);
							}
						}
						
					}
				//}
				
//				for (char c : currLineString.toCharArray()){
//					
//				}
				//logger.debug("ocr line length: " + nodeListOCR.item(i).getTextContent());
				//logger.debug("child nodes of line: " + nodeListOCR.item(i).getFirstChild().getTextContent());
				//logger.debug("ocr line length: " + nodeListOCR.item(i).getTextContent().length());
			}
			
	       TransformerFactory tff  = TransformerFactory.newInstance();
	       Transformer transformer = tff.newTransformer();
	       	      
	       DOMSource xmlSource = new DOMSource(ocrDocument);
	       
	       //File convertedAbbyyXml = new File(htrFile.getParentFile().getParentFile().getAbsolutePath() + File.separatorChar +"ocr" + File.separatorChar + htrFile.getName());
	       StreamResult outputTarget = new StreamResult(convertedFile);
	       
	       transformer.transform(xmlSource, outputTarget);
			
			logger.debug("Statistics of file " + ocrFile.getAbsolutePath() );
			logger.debug("countEqualLines: "+countEqualLines);
			logger.debug("countUnequalLines: " +countUnequalLines);
			logger.debug("countEqualLineLengths: " +countEqualLineLengths);
			logger.debug("countUnequalLinesLenths: " + countUnequalLinesLenths);
			
			//System.in.read();
			
			return convertedFile;
						
			
//			System.out.println("nodeListOCR length " + nodeListOCR.getLength());
//			for (int i = 0; i < nodeListOCR.getLength(); i++){
//				logger.debug("node value: " + nodeListOCR.item(i).getTextContent().length());
//			}
			

//		    Document doc1 = expandedData1.getOwnerDocument();
//		    // insert the nodes
//		    Node expandedData2 = (Node) xpath.evaluate("//expandedData", ocr, NODE);
//		    expandedData1.getParentNode()
//		        .replaceChild(doc1.adoptNode(expandedData2), expandedData1);
//		    // print results
//		    TransformerFactory.newInstance()
//		        .newTransformer()
//		        .transform(new DOMSource(doc1), new StreamResult(System.out));
	    
		} catch (XPathExpressionException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (ParserConfigurationException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (SAXException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (FileNotFoundException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (TransformerConfigurationException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (TransformerException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		
		return null;
		
	}
	
	/**
	 * Removes every attribute from the given node except the coordinate
	 * attributes {@code l}, {@code r}, {@code b}, {@code t} and the
	 * {@code wordStart} flag. The node is modified in place.
	 * <p>
	 * A {@code null} argument and nodes without an attribute map (the DOM API
	 * returns {@code null} from {@link Node#getAttributes()} for anything that
	 * is not an element, e.g. text nodes) are silently left untouched.
	 *
	 * @param charParamInOCR node whose attributes are pruned; may be {@code null}
	 * @throws IOException never thrown by this implementation; kept in the
	 *                     signature so existing callers keep compiling
	 */
	private static void deleteAllObsoleteAtttributes(Node charParamInOCR) throws IOException {
		if (charParamInOCR == null) {
			return;
		}

		NamedNodeMap charParamAttributes = charParamInOCR.getAttributes();
		if (charParamAttributes == null) {
			// non-element node: nothing to prune
			return;
		}

		/*
		 * Snapshot the attribute names first: the NamedNodeMap is live and
		 * shrinks while attributes are removed, so it must not be indexed
		 * during the removal loop.
		 */
		String[] attrNames = new String[charParamAttributes.getLength()];
		for (int i = 0; i < attrNames.length; i++) {
			attrNames[i] = charParamAttributes.item(i).getNodeName();
		}

		for (String attrName : attrNames) {
			// exact-name comparison, equivalent to matching "l|r|b|t|wordStart"
			boolean keep = "l".equals(attrName)
					|| "r".equals(attrName)
					|| "b".equals(attrName)
					|| "t".equals(attrName)
					|| "wordStart".equals(attrName);
			if (!keep) {
				charParamAttributes.removeNamedItem(attrName);
			}
		}
	}
	
	/**
	 * Placeholder: computes the Levenshtein distance between two fixed sample
	 * strings and discards the result. Has no other effect.
	 */
	private static void compareVersions() {
		final String first = "fly";
		final String second = "ant";
		StringUtils.getLevenshteinDistance(first, second);
	}


	/**
	 * Creates a PAGE XML at {@code pageOutFile}.
	 * If an Abbyy XML is given, it is converted via {@code createPageFromAbbyy};
	 * if that yields no page content (or no Abbyy XML is given), an empty PAGE XML
	 * is created for the image instead.
	 * <br/><br/>
	 * This method must NEVER return null. Many mechanisms in Transkribus
	 * depend on this method reliably creating a file.
	 * 
	 * @param pageOutFile target file for the PAGE XML; must not be null
	 * @param doOverwrite if false, an already existing {@code pageOutFile} is an error
	 * @param abbyyXml Abbyy XML to convert; may be null
	 * @param preserveOcrFontFamily passed through to the Abbyy conversion
	 * @param preserveOcrTxtStyles passed through to the Abbyy conversion
	 * @param replaceBadChars passed through to the Abbyy conversion
	 * @param imgFileName name of the image file the page belongs to; must not be empty
	 * @param dim image dimension used when an empty PAGE XML is created; 
	 *            null is replaced by a default {@code new Dimension()}
	 * @return {@code pageOutFile}
	 * @throws IllegalArgumentException if {@code pageOutFile} is null or {@code imgFileName} is empty
	 * @throws FileNotFoundException declared in the signature; not thrown directly by this method
	 * @throws IOException if the output file exists and must not be overwritten, 
	 *                     if the Abbyy conversion fails, or if the PAGE XML cannot be written
	 */
	protected static File createPageXml(File pageOutFile, boolean doOverwrite, File abbyyXml, boolean preserveOcrFontFamily, boolean preserveOcrTxtStyles, 
			boolean replaceBadChars, final String imgFileName, Dimension dim) 
					throws FileNotFoundException, IOException {
		// --- argument validation ---
		if (pageOutFile == null) {
			throw new IllegalArgumentException("PAGE XML output File is null.");
		}
		if (pageOutFile.exists() && !doOverwrite) {
			throw new IOException("PAGE XML already exists at: " + pageOutFile.getAbsolutePath());
		}
		if (StringUtils.isEmpty(imgFileName)) {
			throw new IllegalArgumentException("Image filename must not be empty");
		}
		
		// try the Abbyy XML first, if there is one
		PcGtsType page = (abbyyXml == null)
				? null
				: createPageFromAbbyy(imgFileName, abbyyXml, preserveOcrTxtStyles, preserveOcrFontFamily, replaceBadChars);
		
		// nothing usable so far => fall back to an empty page for this image
		if (page == null) {
			// a missing dimension becomes (0,0) so that the empty page can still be built
			final Dimension pageDim = (dim != null) ? dim : new Dimension();
			logger.warn("No Transcript XML found for img: " + FilenameUtils.getBaseName(imgFileName));
			logger.info("Creating empty PageXml.");
			page = PageXmlUtils.createEmptyPcGtsType(imgFileName, pageDim);
		}
		
		// write the result to disk
		try {
			JaxbUtils.marshalToFile(page, pageOutFile);
		} catch (JAXBException marshalFailure) {
			throw new IOException("Could not create PageXml on disk!", marshalFailure);
		}
		return pageOutFile;
	}
	
	/**
	 * Converts an Abbyy Finereader 10/11 XML into PAGE content.
	 * <p>
	 * Every failure is logged and rethrown as an {@link IOException} carrying the
	 * original cause. Note that the "not a valid Finereader" error raised inside
	 * the try block is itself caught and wrapped into the "Could not migrate file"
	 * exception.
	 * 
	 * @param imgFileName name of the image file, handed to the conversion
	 * @param abbyyXml the Abbyy XML file to convert
	 * @param preserveOcrTxtStyles passed through to the conversion
	 * @param preserveOcrFontFamily passed through to the conversion
	 * @param replaceBadChars passed through to the conversion
	 * @return the result of {@code PageXmlUtils.createPcGtsTypeFromAbbyy}
	 * @throws IOException if the file is not in {@code XmlFormat.ABBYY_10} format or the conversion fails
	 */
	private static PcGtsType createPageFromAbbyy(final String imgFileName, File abbyyXml, boolean preserveOcrTxtStyles,
			boolean preserveOcrFontFamily, boolean replaceBadChars) throws IOException {
		try {
			final XmlFormat detectedFormat = XmlUtils.getXmlFormat(abbyyXml);
			if (!detectedFormat.equals(XmlFormat.ABBYY_10)) {
				throw new IOException("Not a valid Finereader10/11 XML file.");
			}
			
			logger.info(abbyyXml.getAbsolutePath() + ": Transforming Finereader10/11 XML to PAGE XML.");
			return PageXmlUtils.createPcGtsTypeFromAbbyy(
					abbyyXml, imgFileName, 
					preserveOcrTxtStyles, preserveOcrFontFamily, 
					replaceBadChars
					);
		} catch (IOException | TransformerException migrationFailure) {
			logger.error(migrationFailure.getMessage(), migrationFailure);
			throw new IOException("Could not migrate file: " + abbyyXml.getAbsolutePath(), migrationFailure);
		} catch (ParserConfigurationException | SAXException xmlFailure) {
			logger.error(xmlFailure.getMessage(), xmlFailure);
			throw new IOException("Could not transform XML file!", xmlFailure);
		} catch (JAXBException jaxbFailure) {
			/* TODO This exception is only thrown when the pageXML is unmarshalled 
			 * for inserting the image filename which is not included in the abbyy xml! */
			logger.error(jaxbFailure.getMessage(), jaxbFailure);
			throw new IOException("Transformation output is not a valid page XML!", jaxbFailure);
		}
	}

	/**
	 * Prints the command line usage hint to standard output.
	 */
	private static void usage() {
		final String helpText = "Use: java -jar jarFileName htrDirectoryName ocrDirectoryName\n"
				+ "folders contain htr and ocr results and must contain the same number of files!";
		System.out.println(helpText);
	}
	
	/**
	 * Prints one line per file to standard output: the file name (left-aligned,
	 * padded to 20 characters) followed by its last-modified timestamp.
	 * <p>
	 * A {@code null} array is treated like an empty one, because
	 * {@link File#listFiles()} returns {@code null} when the directory cannot be read.
	 *
	 * @param files the files to list; may be {@code null}
	 */
	public static void displayFiles(File[] files) {
		if (files == null) {
			return;
		}
		for (File file : files) {
			// the label used to be printed without any value behind it
			System.out.printf("File: %-20s Last Modified: %s\n",
					file.getName(), new java.util.Date(file.lastModified()));
		}
	}

}