java source code of DjVuText

//C- -------------------------------------------------------------------
//C- Java DjVu (r) (v. 0.8)
//C- Copyright (c) 2004-2005 LizardTech, Inc.  All Rights Reserved.
//C- Java DjVu is protected by U.S. Pat. No.C- 6,058,214 and patents
//C- pending.
//C-
//C- This software is subject to, and may be distributed under, the
//C- GNU General Public License, Version 2. The license should have
//C- accompanied the software or you may obtain a copy of the license
//C- from the Free Software Foundation at http://www.fsf.org .
//C-
//C- The computer code originally released by LizardTech under this
//C- license and unmodified by other parties is deemed "the LIZARDTECH
//C- ORIGINAL CODE."  Subject to any third party intellectual property
//C- claims, LizardTech grants recipient a worldwide, royalty-free,
//C- non-exclusive license to make, use, sell, or otherwise dispose of
//C- the LIZARDTECH ORIGINAL CODE or of programs derived from the
//C- LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
//C- General Public License.   This grant only confers the right to
//C- infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
//C- the extent such infringement is reasonably necessary to enable
//C- recipient to make, have made, practice, sell, or otherwise dispose
//C- of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
//C- any greater extent that may be necessary to utilize further
//C- modifications or combinations.
//C-
//C- The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
//C- OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
//C- TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
//C- MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
//C-
//C- In addition, as a special exception, LizardTech Inc. gives permission
//C- to link the code of this program with the proprietary Java
//C- implementation provided by Sun (or other vendors as well), and
//C- distribute linked combinations including the two. You must obey the
//C- GNU General Public License in all respects for all of the code used
//C- other than the proprietary Java implementation. If you modify this
//C- file, you may extend this exception to your version of the file, but
//C- you are not obligated to do so. If you do not wish to do so, delete
//C- this exception statement from your version.
//C- -------------------------------------------------------------------
//C- Developed by Bill C. Riemers, Foxtrot Technologies Inc. as work for
//C- hire under US copyright laws.
//C- -------------------------------------------------------------------
//
package com.lizardtech.djvu.text;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;

import com.google.gwt.regexp.shared.RegExp;
import com.google.gwt.typedarrays.shared.ArrayBuffer;
import com.google.gwt.typedarrays.shared.TypedArrays;
import com.google.gwt.typedarrays.shared.Uint8Array;
import com.lizardtech.djvu.BSInputStream;
import com.lizardtech.djvu.CachedInputStream;
import com.lizardtech.djvu.Codec;
import com.lizardtech.djvu.GRect;
import com.lizardtech.djvu.InputStream;
import com.lizardtech.djvu.JsArrayList;
import com.lizardtech.djvu.NumContext;


/**
 * <p>
 * This class implements the hidden text layer understood by the DjVu
 * plugins and encoders, built from the contents of TXT chunks. The
 * contents of a TXTa or TXTz chunk (possibly nested inside a FORM chunk)
 * should be passed to decode for parsing, which
 * initializes this class and fills in the decoded data.
 * </p><p>
 * Description of the text contained in a DjVu page.  This class contains
 * the textual data for the page.  It describes the text as a hierarchy of
 * zones corresponding to page, column, region, paragraph, lines, words,
 * etc... The piece of text associated with each zone is represented by an
 * offset and a length describing a segment of a global UTF8 encoded
 * byteArray.
 * </p><p>
 * Constants are used to tell what a zone describes. This can be
 * useful for a copy/paste application.  The deeper we go into the
 * hierarchy, the higher the constant.
 * </p>
 */
public class DjVuText
  implements Codec
{
  //~ Static fields/initializers ---------------------------------------------

  private static final RegExp NON_PRINTABLE = RegExp.compile("[\\0-\\x1F\\x7F-\\x9F\\u0378\\u0379\\u037F-\\u0383\\u038B\\u038D\\u03A2"
      + "\\u0528-\\u0530\\u0557\\u0558\\u0560\\u0588\\u058B-\\u058E\\u0590\\u05C8-\\u05CF\\u05EB-\\u05EF"
      + "\\u05F5-\\u0605\\u061C\\u061D\\u06DD\\u070E\\u070F\\u074B\\u074C\\u07B2-\\u07BF\\u07FB-\\u07FF"
      + "\\u082E\\u082F\\u083F\\u085C\\u085D\\u085F-\\u089F\\u08A1\\u08AD-\\u08E3\\u08FF\\u0978\\u0980"
      + "\\u0984\\u098D\\u098E\\u0991\\u0992\\u09A9\\u09B1\\u09B3-\\u09B5\\u09BA\\u09BB\\u09C5\\u09C6\\u09C9"
      + "\\u09CA\\u09CF-\\u09D6\\u09D8-\\u09DB\\u09DE\\u09E4\\u09E5\\u09FC-\\u0A00\\u0A04\\u0A0B-\\u0A0E"
      + "\\u0A11\\u0A12\\u0A29\\u0A31\\u0A34\\u0A37\\u0A3A\\u0A3B\\u0A3D\\u0A43-\\u0A46\\u0A49\\u0A4A"
      + "\\u0A4E-\\u0A50\\u0A52-\\u0A58\\u0A5D\\u0A5F-\\u0A65\\u0A76-\\u0A80\\u0A84\\u0A8E\\u0A92\\u0AA9\\u0AB1"
      + "\\u0AB4\\u0ABA\\u0ABB\\u0AC6\\u0ACA\\u0ACE\\u0ACF\\u0AD1-\\u0ADF\\u0AE4\\u0AE5\\u0AF2-\\u0B00\\u0B04"
      + "\\u0B0D\\u0B0E\\u0B11\\u0B12\\u0B29\\u0B31\\u0B34\\u0B3A\\u0B3B\\u0B45\\u0B46\\u0B49\\u0B4A"
      + "\\u0B4E-\\u0B55\\u0B58-\\u0B5B\\u0B5E\\u0B64\\u0B65\\u0B78-\\u0B81\\u0B84\\u0B8B-\\u0B8D\\u0B91"
      + "\\u0B96-\\u0B98\\u0B9B\\u0B9D\\u0BA0-\\u0BA2\\u0BA5-\\u0BA7\\u0BAB-\\u0BAD\\u0BBA-\\u0BBD\\u0BC3-\\u0BC5"
      + "\\u0BC9\\u0BCE\\u0BCF\\u0BD1-\\u0BD6\\u0BD8-\\u0BE5\\u0BFB-\\u0C00\\u0C04\\u0C0D\\u0C11\\u0C29\\u0C34"
      + "\\u0C3A-\\u0C3C\\u0C45\\u0C49\\u0C4E-\\u0C54\\u0C57\\u0C5A-\\u0C5F\\u0C64\\u0C65\\u0C70-\\u0C77\\u0C80"
      + "\\u0C81\\u0C84\\u0C8D\\u0C91\\u0CA9\\u0CB4\\u0CBA\\u0CBB\\u0CC5\\u0CC9\\u0CCE-\\u0CD4\\u0CD7-\\u0CDD"
      + "\\u0CDF\\u0CE4\\u0CE5\\u0CF0\\u0CF3-\\u0D01\\u0D04\\u0D0D\\u0D11\\u0D3B\\u0D3C\\u0D45\\u0D49"
      + "\\u0D4F-\\u0D56\\u0D58-\\u0D5F\\u0D64\\u0D65\\u0D76-\\u0D78\\u0D80\\u0D81\\u0D84\\u0D97-\\u0D99\\u0DB2"
      + "\\u0DBC\\u0DBE\\u0DBF\\u0DC7-\\u0DC9\\u0DCB-\\u0DCE\\u0DD5\\u0DD7\\u0DE0-\\u0DF1\\u0DF5-\\u0E00"
      + "\\u0E3B-\\u0E3E\\u0E5C-\\u0E80\\u0E83\\u0E85\\u0E86\\u0E89\\u0E8B\\u0E8C\\u0E8E-\\u0E93\\u0E98\\u0EA0"
      + "\\u0EA4\\u0EA6\\u0EA8\\u0EA9\\u0EAC\\u0EBA\\u0EBE\\u0EBF\\u0EC5\\u0EC7\\u0ECE\\u0ECF\\u0EDA\\u0EDB"
      + "\\u0EE0-\\u0EFF\\u0F48\\u0F6D-\\u0F70\\u0F98\\u0FBD\\u0FCD\\u0FDB-\\u0FFF\\u10C6\\u10C8-\\u10CC\\u10CE"
      + "\\u10CF\\u1249\\u124E\\u124F\\u1257\\u1259\\u125E\\u125F\\u1289\\u128E\\u128F\\u12B1\\u12B6\\u12B7"
      + "\\u12BF\\u12C1\\u12C6\\u12C7\\u12D7\\u1311\\u1316\\u1317\\u135B\\u135C\\u137D-\\u137F\\u139A-\\u139F"
      + "\\u13F5-\\u13FF\\u169D-\\u169F\\u16F1-\\u16FF\\u170D\\u1715-\\u171F\\u1737-\\u173F\\u1754-\\u175F"
      + "\\u176D\\u1771\\u1774-\\u177F\\u17DE\\u17DF\\u17EA-\\u17EF\\u17FA-\\u17FF\\u180F\\u181A-\\u181F"
      + "\\u1878-\\u187F\\u18AB-\\u18AF\\u18F6-\\u18FF\\u191D-\\u191F\\u192C-\\u192F\\u193C-\\u193F\\u1941-\\u1943"
      + "\\u196E\\u196F\\u1975-\\u197F\\u19AC-\\u19AF\\u19CA-\\u19CF\\u19DB-\\u19DD\\u1A1C\\u1A1D\\u1A5F\\u1A7D"
      + "\\u1A7E\\u1A8A-\\u1A8F\\u1A9A-\\u1A9F\\u1AAE-\\u1AFF\\u1B4C-\\u1B4F\\u1B7D-\\u1B7F\\u1BF4-\\u1BFB"
      + "\\u1C38-\\u1C3A\\u1C4A-\\u1C4C\\u1C80-\\u1CBF\\u1CC8-\\u1CCF\\u1CF7-\\u1CFF\\u1DE7-\\u1DFB\\u1F16\\u1F17"
      + "\\u1F1E\\u1F1F\\u1F46\\u1F47\\u1F4E\\u1F4F\\u1F58\\u1F5A\\u1F5C\\u1F5E\\u1F7E\\u1F7F\\u1FB5\\u1FC5\\u1FD4"
      + "\\u1FD5\\u1FDC\\u1FF0\\u1FF1\\u1FF5\\u1FFF\\u200B-\\u200F\\u202A-\\u202E\\u2060-\\u206F\\u2072\\u2073"
      + "\\u208F\\u209D-\\u209F\\u20BB-\\u20CF\\u20F1-\\u20FF\\u218A-\\u218F\\u23F4-\\u23FF\\u2427-\\u243F"
      + "\\u244B-\\u245F\\u2700\\u2B4D-\\u2B4F\\u2B5A-\\u2BFF\\u2C2F\\u2C5F\\u2CF4-\\u2CF8\\u2D26\\u2D28-\\u2D2C"
      + "\\u2D2E\\u2D2F\\u2D68-\\u2D6E\\u2D71-\\u2D7E\\u2D97-\\u2D9F\\u2DA7\\u2DAF\\u2DB7\\u2DBF\\u2DC7\\u2DCF"
      + "\\u2DD7\\u2DDF\\u2E3C-\\u2E7F\\u2E9A\\u2EF4-\\u2EFF\\u2FD6-\\u2FEF\\u2FFC-\\u2FFF\\u3040\\u3097\\u3098"
      + "\\u3100-\\u3104\\u312E-\\u3130\\u318F\\u31BB-\\u31BF\\u31E4-\\u31EF\\u321F\\u32FF\\u4DB6-\\u4DBF"
      + "\\u9FCD-\\u9FFF\\uA48D-\\uA48F\\uA4C7-\\uA4CF\\uA62C-\\uA63F\\uA698-\\uA69E\\uA6F8-\\uA6FF\\uA78F"
      + "\\uA794-\\uA79F\\uA7AB-\\uA7F7\\uA82C-\\uA82F\\uA83A-\\uA83F\\uA878-\\uA87F\\uA8C5-\\uA8CD\\uA8DA-\\uA8DF"
      + "\\uA8FC-\\uA8FF\\uA954-\\uA95E\\uA97D-\\uA97F\\uA9CE\\uA9DA-\\uA9DD\\uA9E0-\\uA9FF\\uAA37-\\uAA3F\\uAA4E"
      + "\\uAA4F\\uAA5A\\uAA5B\\uAA7C-\\uAA7F\\uAAC3-\\uAADA\\uAAF7-\\uAB00\\uAB07\\uAB08\\uAB0F\\uAB10"
      + "\\uAB17-\\uAB1F\\uAB27\\uAB2F-\\uABBF\\uABEE\\uABEF\\uABFA-\\uABFF\\uD7A4-\\uD7AF\\uD7C7-\\uD7CA"
      + "\\uD7FC-\\uF8FF\\uFA6E\\uFA6F\\uFADA-\\uFAFF\\uFB07-\\uFB12\\uFB18-\\uFB1C\\uFB37\\uFB3D\\uFB3F\\uFB42"
      + "\\uFB45\\uFBC2-\\uFBD2\\uFD40-\\uFD4F\\uFD90\\uFD91\\uFDC8-\\uFDEF\\uFDFE\\uFDFF\\uFE1A-\\uFE1F"
      + "\\uFE27-\\uFE2F\\uFE53\\uFE67\\uFE6C-\\uFE6F\\uFE75\\uFEFD-\\uFF00\\uFFBF-\\uFFC1\\uFFC8\\uFFC9\\uFFD0"
      + "\\uFFD1\\uFFD8\\uFFD9\\uFFDD-\\uFFDF\\uFFE7\\uFFEF-\\uFFFB\\uFFFE\\uFFFF]", "g");
  /**
   * Matches a soft hyphen (U+00AD) followed by one or more whitespace
   * characters at the end of the input. getString() replaces a match with
   * a hyphen character (U+2010).
   */
  private static final  RegExp SOFT_HYPHEN = RegExp.compile("\\xAD\\s+$");

  // Zone type codes. The values grow as the zone gets smaller, so a
  // numerically larger type sits deeper in the zone hierarchy.

  /** Zone type of a whole page (smallest code, outermost zone). */
  public static final int PAGE = 1;

  /** Zone type of a column. */
  public static final int COLUMN = 2;

  /** Zone type of a region. */
  public static final int REGION = 3;

  /** Zone type of a paragraph. */
  public static final int PARAGRAPH = 4;

  /** Zone type of a line. */
  public static final int LINE = 5;

  /** Zone type of a word. */
  public static final int WORD = 6;

  /** Zone type of a single character (largest code, innermost zone). */
  public static final int CHARACTER = 7;

  // Separator bytes found in the raw text. decode() rewrites the column,
  // region and paragraph separators to a line feed.

  /** End-of-column separator: VT, Vertical Tab (0x0b). */
  public static final int end_of_column = 0x0b;

  /** End-of-region separator: GS, Group Separator (0x1d). */
  public static final int end_of_region = 0x1d;

  /** End-of-paragraph separator: US, Unit Separator (0x1f). */
  public static final int end_of_paragraph = 0x1f;

  /** End-of-line separator: LF, Line Feed (0x0a). */
  public static final int end_of_line = 0x0a;

  //~ Instance fields --------------------------------------------------------

  /** Main zone in the document. This zone represents the page. */
  public Zone page_zone = new Zone();

  /**
   * Textual data for this page. The content of this byte array is encoded
   * as UTF-8, which coincides with ASCII for single-byte characters.
   * Columns, regions, paragraphs and lines are delimited by the
   * following control characters:
   * 
   * <table>
   * <tr><th>Name</th><th>Octal</th><th>Ascii name</th></tr>
   * <tr><td>DjVuText.end_of_column</td><td>013</td><td>VT, Vertical Tab</td></tr>
   * <tr><td>DjVuText.end_of_region</td><td>035</td><td>GS, Group Separator</td></tr>
   * <tr><td>DjVuText.end_of_paragraph</td><td>037</td><td>US, Unit Separator</td></tr>
   * <tr><td>DjVuText.end_of_line</td><td>012</td><td>LF, Line Feed</td></tr>
   * </table>
   */
  private Uint8Array textByteArray;
  // Buffer of textByteArray (set alongside it in setTextByteArray); this is
  // the object handed out by getTransferable().
  private ArrayBuffer textByteBuffer;

  //~ Constructors -----------------------------------------------------------

	public DjVuText() {
		setTextByteArray(new byte[0]);
	}

	/**
	 * Creates a copy of another text layer. The text bytes (typed array and
	 * its buffer) are shared with the source object; the page zone is
	 * duplicated through the Zone copy constructor.
	 *
	 * @param toCopy the text layer to copy
	 */
	public DjVuText(DjVuText toCopy) {
		this.page_zone = new Zone(toCopy.page_zone);
		this.textByteArray = toCopy.textByteArray;
		this.textByteBuffer = toCopy.textByteBuffer;
	}

  //~ Methods ----------------------------------------------------------------

  /**
   * Reports whether this codec carries image data. A text layer never does.
   *
   * @return always false
   */
  @Override
  public boolean isImageData()
  {
    final boolean imageData = false;
    return imageData;
  }

  /**
   * Counts the characters that start inside a byte range of the text.
   *
   * @param from byte position to start counting from
   * @param end byte position to stop counting (exclusive)
   *
   * @return the number of character steps taken from {@code from} before
   *         reaching {@code end} or the end of the text
   */
  public int getLength(
    final int from,
    final int end)
  {
    int count  = 0;
    int cursor = from;

    // Hop from character start to character start until the range is used up.
    while((cursor < end) && (cursor < textByteArray.length()))
    {
      cursor = nextChar(textByteArray, cursor);
      count++;
    }

    return count;
  }

  /**
   * Converts a byte range of the text into a string.
   *
   * <p>The bytes are decoded as UTF-8. A trailing soft hyphen followed by
   * whitespace is replaced with a hyphen (U+2010) and every character
   * matched by NON_PRINTABLE is replaced with '?'. If decoding raises an
   * IllegalArgumentException (presumably malformed UTF-8 on some runtimes),
   * the range is rendered byte by byte instead, with every byte outside
   * the printable ASCII range 32..126 shown as '?'.</p>
   *
   * @param start byte position of the first character.
   * @param end byte position to end the string (exclusive)
   *
   * @return The converted string
   *
   * @throws RuntimeException if the runtime does not know the UTF-8
   *         charset; the original exception is attached as the cause
   */
  public String getString(
    int start,
    int end)
  {
      try
      {
        final byte[] array = new byte[end - start];
        for (int i = 0; i < array.length; i++)
          array[i] = (byte) textByteArray.get(i + start);
        String result = new String(array, "UTF-8");
        result = SOFT_HYPHEN.replace(result, "\u2010" /* hard hyphen */);
        result = NON_PRINTABLE.replace(result, "?");
        return result;
      }
      catch(UnsupportedEncodingException exp)
      {
        // Keep the cause so the failure can be diagnosed.
        throw new RuntimeException("UTF-8 encoding is not supported", exp);
      }
      catch(IllegalArgumentException e) {
        // Fallback: one char per byte, masking anything non-printable.
        final char[] chars = new char[end - start];
        for (int i = 0; i < chars.length; i++) {
            chars[i] = (char) (textByteArray.get(start + i) & 0xFF);
            if (chars[i] < 32 || chars[i] > 126)
                chars[i] = '?';
        }
        return new String(chars);
      }
  }

  /**
   * Replaces the text data with a copy of the given bytes and records the
   * buffer of the new typed array.
   *
   * @param textByteArray array of bytes to interpret
   */
  public void setTextByteArray(final byte[] textByteArray)
  {
    final Uint8Array converted = convertArray(textByteArray);
    this.textByteArray = converted;
    this.textByteBuffer = converted.buffer();
  }

	// Copies a Java byte array, element by element, into a newly created
	// Uint8Array of the same length.
	private Uint8Array convertArray(byte[] bytes) {
		final int count = bytes.length;
		final Uint8Array copy = TypedArrays.createUint8Array(count);
		int index = 0;
		while (index < count) {
			copy.set(index, bytes[index]);
			index++;
		}
		return copy;
	}

  /**
   * Decodes the hidden text layer TXT into internal representation. NOTE:
   * column, region and paragraph separators are replaced with line feeds.
   *
   * <p>The chunk starts with a 24-bit text size followed by that many text
   * bytes. An optional version byte and the zone hierarchy may follow. A
   * chunk named "TXTz" is first wrapped in a BSInputStream and then handled
   * like "TXTa".</p>
   *
   * @param input The chunk to decode.
   *
   * @throws IOException if fewer text bytes than announced could be read
   *         ("DjVuText.corrupt_chunk"; the bytes read so far are still
   *         stored), or if the zone version is not supported.
   */
  @Override
  public void decode(CachedInputStream input)
    throws IOException
  {
    if("TXTz".equals(input.getName()))
    {
      input=new CachedInputStream().init(
              new BSInputStream().init(input));
      input.setName("TXTa");
    }

    // Read text
    final int    textsize = input.read24();
    final byte[] bytes    = new byte[textsize];

    // A negative result (the usual end-of-stream marker) means nothing was
    // read; clamp it so it cannot be used as an array index below.
    final int readsize = Math.max(input.read(bytes), 0);

    // Flatten the structural separators to line feeds, up to the first NUL.
    for(int s = 0; s < readsize; s++)
    {
      final byte b = bytes[s];

      if(b == 0)
      {
        break;
      }

      if(
        (b == DjVuText.end_of_column)
        || (b == DjVuText.end_of_region)
        || (b == DjVuText.end_of_paragraph))
      {
        bytes[s] = '\n';
      }
    }

    if(readsize < textsize)
    {
      // Truncated chunk: zero the part that was not read, keep what we
      // have, and report the corruption.
      for(int s = readsize; s < textsize; s++)
      {
        bytes[s] = 0;
      }

      setTextByteArray(bytes);
      throw new IOException("DjVuText.corrupt_chunk");
    }

    setTextByteArray(bytes);

    // Try reading zones; -1 means the chunk carries no zone information.
    final int version = input.read();

    if(version != -1)
    {
      if(version != Zone.version)
      {
        throw new IOException("DjVuText.bad_version=" + version);
      }

      page_zone.decode(input, textsize);
    }
  }

  /**
   * Finds the text covered by a rectangle.
   *
   * <p>The page zone determines the byte range of text selected by the box.
   * When that range is not empty, the zones covering it are collected and
   * each contributes its smallest rectangles to the result.</p>
   *
   * @param box bounding box to search
   * @param text buffer that is cleared and then filled with the text found
   * @param padding number of pixels to add to each rectangle; a negative
   *        value selects the unpadded variant of get_smallest
   *
   * @return a list of the smallest level rectangles representing the text found
   */
  public ArrayList<GRect> find_text_with_rect(
    GRect        box,
    StringBuffer text,
    int          padding)
  {
    final ArrayList<GRect> rects = new ArrayList<>();
    final NumContext firstByte = new NumContext(0);
    final NumContext lastByte  = new NumContext(0);
    page_zone.get_text_with_rect(box, firstByte, lastByte);

    final int startByte = firstByte.intValue();
    final int endByte   = lastByte.intValue();

    if(startByte != endByte)
    {
      final ArrayList<Zone> covering = new ArrayList<>();
      page_zone.append_zones(covering, startByte, endByte);

      for(int index = 0; index < covering.size(); index++)
      {
        final Zone zone = covering.get(index);

        if(padding < 0)
        {
          zone.get_smallest(rects);
        }
        else
        {
          zone.get_smallest(rects, padding);
        }
      }
    }

    text.setLength(0);
    text.append(getString(startByte, endByte));

    return rects;
  }

  /**
   * Finds the text covered by a rectangle, using a padding of zero pixels.
   *
   * @param box bounding box to search
   * @param text buffer to fill with the text found
   *
   * @return a list of the smallest level rectangles representing the text found
   */
  public ArrayList<GRect> find_text_with_rect(
    GRect        box,
    StringBuffer text)
  {
    final int noPadding = 0;

    return find_text_with_rect(box, text, noPadding);
  }

  /**
   * Collects all zones of type zone_type below the given parent zone.
   * Matching zones are appended to zone_list in traversal order; a zone
   * that is already in the list is not added again.
   *
   * @param zone_type the zone type to list.
   * @param parent parent zone to start from
   * @param zone_list list to add the zones to
   */
  public void get_zones(
    int    zone_type,
    Zone   parent,
    ArrayList<Zone> zone_list)
  {
    // A parent at or below the requested level cannot contain such zones.
    if(parent.ztype >= zone_type)
    {
      return;
    }

    // One pass over the children is enough: repeating the scan once per
    // intermediate level (as was done before) only re-visits zones that
    // are already in the list.
    for(int pos = 0; pos < parent.children.size(); ++pos)
    {
      final Zone child = parent.children.get(pos);

      if(child.ztype == zone_type)
      {
        if(!zone_list.contains(child))
        {
          zone_list.add(child);
        }
      }
      else if(child.ztype < zone_type)
      {
        // Shallower than requested: descend into this branch.
        get_zones(zone_type, child, zone_list);
      }
    }
  }

  /**
   * Tests whether there is a meaningful zone hierarchy: there must be
   * text, the page zone must have at least one child zone, and the page
   * zone itself must not be empty.
   *
   * @return true if there are valid zones
   */
  public boolean has_valid_zones()
  {
    // The children test used to be inverted, which reported a page zone
    // WITHOUT any child zones as a valid hierarchy.
    return (textByteArray.length() != 0)
    && !page_zone.children.isEmpty()
    && !page_zone.isEmpty();
  }

  /**
   * Walks a sequence of chunks and decodes every TXTa and TXTz chunk.
   * Chunks whose name starts with "FORM:" are searched recursively; all
   * other chunks are ignored.
   *
   * @param iff enumeration of CachedInputStream's to read; may be null, in
   *        which case nothing is decoded.
   *
   * @return the initialized DjVuText object
   *
   * @throws IOException if an IO error occurs.
   */
  public DjVuText init(final Enumeration<CachedInputStream> iff)
    throws IOException
  {
    if(iff == null)
    {
      return this;
    }

    while(iff.hasMoreElements())
    {
      final CachedInputStream chunk = iff.nextElement();
      final String chunkId = chunk.getName();

      if(chunkId.startsWith("FORM:"))
      {
        // Nested form: look for text chunks inside it.
        init(chunk.getIFFChunks());
        continue;
      }

      if("TXTa".equals(chunkId) || "TXTz".equals(chunkId))
      {
        decode(chunk);
      }
    }

    return this;
  }

  /**
   * Initializes the text layer from a stream. If the stream exposes IFF
   * chunks they are searched for TXTz and TXTa chunks; otherwise the
   * stream itself is decoded as a text chunk.
   *
   * @param pool input stream to read.
   *
   * @return the initialized DjVuText object
   *
   * @throws IOException if an IO error occurs.
   */
  public DjVuText init(CachedInputStream pool)
    throws IOException
  {
    final Enumeration<CachedInputStream> chunks = pool.getIFFChunks();

    if(chunks == null)
    {
      // No chunk structure: treat the stream as the text chunk itself.
      decode(pool);
      return this;
    }

    return init(chunks);
  }

  /**
   * Reports the size of the hidden text in bytes (not characters).
   *
   * @return number of bytes
   */
  public int length()
  {
    final int byteCount = textByteArray.length();

    return byteCount;
  }

  /**
   * Searches the text for the given string. If an occurrence is found its
   * start position (a byte offset) is returned and zone_list holds the
   * zones covering it; otherwise -1 is returned and zone_list is empty.
   *
   * <p>The search string is trimmed (and lower-cased when match_case is
   * false) and encoded as UTF-8 before searching. Only positions that
   * start a character are tried, never UTF-8 continuation bytes.</p>
   *
   * @param zone_list A list of smallest zones covering the text.
   * @param string String to be found. May contain spaces as word separators.
   * @param from Position returned by last search; the search resumes at
   *        the next position (forward) or the previous one (backward). If
   *        from is out of bounds of the text it will be set to -1 for
   *        searching forward and to the text length for searching backwards.
   * @param search_fwd TRUE means to search forward. FALSE - backward.
   * @param match_case If set to FALSE the search will be case-insensitive.
   * @param whole_word If set to TRUE the function will try to find a whole
   *        word matching the passed string. The word separators are all
   *        blank and punctuation characters. The passed string may
   *        <b>not</b> contain word separators, that is it <b>must</b> be a
   *        whole word.
   *
   * @return Start of text if found, otherwise -1.
   *
   * @throws IllegalArgumentException if the search string contains no
   *         non-whitespace characters
   */
  public int search_string(
    final ArrayList<Zone>  zone_list,
    final String  string,
    int           from,
    final boolean search_fwd,
    final boolean match_case,
    final boolean whole_word)
  {
    zone_list.clear();

    // Normalize once, then encode. Only the "charset unknown" failure is
    // handled by falling back to the platform encoding; anything else is a
    // genuine error and must not be swallowed.
    final String needle = (match_case ? string : string.toLowerCase()).trim();
    byte[] byteArray;

    try
    {
      byteArray = needle.getBytes("UTF-8");
    }
    catch(final UnsupportedEncodingException unsupported)
    {
      byteArray = needle.getBytes();
    }

    // Make sure there's something left to search for
    if(byteArray.length == 0)
    {
      throw new IllegalArgumentException("DjVuText.one_word");
    }

    final int textLength = textByteArray.length();

    if((textLength == 0) || (byteArray.length > textLength))
    {
      return -1;
    }

    if(search_fwd)
    {
      int pos = ((from < 0) || (from >= textLength)) ? -1 : from;

      while(true)
      {
        // Advance to the next byte that starts a character.
        do
        {
          if(++pos >= textLength)
          {
            return -1;
          }
        }
        while((textByteArray.get(pos) & 0xc0) == 0x80);

        find_zones(zone_list, byteArray, pos, whole_word, match_case);

        if(!zone_list.isEmpty())
        {
          return pos;
        }
      }
    }

    // search backward
    int pos = ((from < 0) || (from >= textLength)) ? textLength : from;

    while(--pos >= 0)
    {
      if((textByteArray.get(pos) & 0xc0) != 0x80)
      {
        find_zones(zone_list, byteArray, pos, whole_word, match_case);

        if(!zone_list.isEmpty())
        {
          return pos;
        }
      }
    }

    return -1;
  }

  /**
   * Searches the text for the given string without requiring a whole-word
   * match. If an occurrence is found its start position is returned;
   * otherwise the result is -1.
   *
   * @param zone_list A list of smallest zones covering the text.
   * @param string String to be found. May contain spaces as word separators.
   * @param from Position returned by last search.  If from is out of bounds
   *        of the text it will be set to -1 for searching forward and
   *        to the text length for searching backwards.
   * @param search_fwd TRUE means to search forward. FALSE - backward.
   * @param match_case If set to FALSE the search will be case-insensitive.
   *
   * @return Start of text if found, otherwise -1.
   *
   * @throws IllegalArgumentException if the search string contains no
   *         non-whitespace characters
   */
  public int search_string(
    final ArrayList<Zone>  zone_list,
    final String  string,
    final int     from,
    final boolean search_fwd,
    final boolean match_case)
  {
    final boolean wholeWord = false;

    return search_string(zone_list, string, from, search_fwd, match_case, wholeWord);
  }

  /**
   * Returns the position of the first character beyond the found string,
   * if the text at from contains the same words as the substring in the
   * same order (but possibly with a different number of separators between
   * words). The 'separators' in this function are blank and 'end_of_...'
   * characters. If the text is not found then the initial from value will
   * be returned. NOTE, that the returned position may be different from
   * (substring.length+from) because of a different number of spaces between
   * words in substring and text.
   *
   * <p>The substring is trimmed (and lower-cased when match_case is false)
   * and encoded as UTF-8 before it is handed to the byte-array overload.</p>
   *
   * @param substring string to search for
   * @param from start position
   * @param match_case true if case sensitive
   *
   * @return end position if the substring is found
   */
  public int startsWith(
    final String        substring,
    final int     from,
    final boolean match_case)
  {
    final String needle =
      (match_case ? substring : substring.toLowerCase()).trim();
    byte[] needleBytes;

    // Only the encoding step is guarded: a failure inside the actual
    // comparison must surface instead of being swallowed and retried.
    try
    {
      needleBytes = needle.getBytes("UTF-8");
    }
    catch(final UnsupportedEncodingException unsupported)
    {
      needleBytes = needle.getBytes();
    }

    return startsWith(needleBytes, from, match_case);
  }

  /**
   * Converts the complete text layer to a string.
   *
   * @return the converted string
   */
  @Override
  public String toString()
  {
    final int byteCount = textByteArray.length();

    return getString(0, byteCount);
  }

	/**
	 * Returns the buffer that holds the text bytes.
	 *
	 * @return the ArrayBuffer recorded by setTextByteArray
	 */
	public Object getTransferable() {
		final Object transferable = textByteBuffer;
		return transferable;
	}

  // Extracts the UTF-8 encoded character that starts at pos and returns its
  // code point.
  //
  // - A position outside the array yields 0 (NUL, which the callers treat
  //   as a separator); find_zones() probes one position before the start
  //   and one past the end of the text.
  // - Any byte with the high bit clear, including NUL, is a complete
  //   single-byte character.
  // - A continuation byte (10xxxxxx) at pos is rejected with
  //   IllegalStateException("Invalid UTF8").
  // - Otherwise the sequence length is derived from the lead byte: after
  //   each 6-bit shift the bit tested below is the lead-byte bit that is 0
  //   exactly when the sequence ends there.
  private int getChar(
    Uint8Array byteArray,
    int    pos)
  {
    if((pos < 0) || (pos >= byteArray.length()))
    {
      return 0;
    }

    int value = byteArray.get(pos++);

    if((value & 0x80) == 0)
    {
      // Plain single-byte character (0..127).
      return value;
    }

    if((value & 0xc0) == 0x80)
    {
      throw new IllegalStateException("Invalid UTF8");
    }

    // Lead byte of a multi-byte sequence.
    value = (value << 6) | (byteArray.get(pos++) & 0x7f);

    if((value & 0x800) == 0)
    {
      return value & 0x7ff; // two bytes
    }

    value = (value << 6) | (byteArray.get(pos++) & 0x7f);

    if((value & 0x10000) == 0)
    {
      return value & 0xffff; // three bytes
    }

    value = (value << 6) | (byteArray.get(pos++) & 0x7f);

    if((value & 0x200000) == 0)
    {
      return value & 0x1fffff; // four bytes
    }

    value = (value << 6) | (byteArray.get(pos++) & 0x7f);

    if((value & 0x4000000) == 0)
    {
      return value & 0x3ffffff; // five bytes
    }

    return (value << 6) | (byteArray.get(pos++) & 0x7f);
  }

  // Returns TRUE if the character may be part of a word: a letter, a
  // digit, '$' or '_'. Separators and punctuation yield FALSE.
  private static boolean isJavaIdentifier(final char value)
  {
    switch(value)
    {
      case '$' :
      case '_' :
        return true;
      default :
        return Character.isLetterOrDigit(value);
    }
  }

  // Same test for a code point given as an int. Values that do not fit
  // into a single char (negative or above 0xffff) yield FALSE.
  private static boolean isJavaIdentifier(final int value)
  {
    if((value < 0) || (value > 0xffff))
    {
      return false;
    }

    return isJavaIdentifier((char)value);
  }

  // Determines whether the UTF-8 character starting at first[firstPos] is
  // identical to the one starting at second[secondPos].
  //
  // "first" holds signed Java bytes while "second" yields unsigned values
  // (0..255), so every byte is masked with 0xff before it is compared.
  // Without the mask no byte >= 0x80 could ever compare equal, i.e. no
  // non-ASCII character would ever match. The end of either array is
  // treated as the end of the character.
  private static boolean char_equal(
    byte[] first,
    int    firstPos,
    Uint8Array second,
    int    secondPos)
  {
    final int secondLength = second.length();

    if((firstPos >= first.length) || (secondPos >= secondLength))
    {
      // Nothing left to compare on at least one side.
      return false;
    }

    final int lead = first[firstPos++] & 0xff;

    if(lead != (second.get(secondPos++) & 0xff))
    {
      return false;
    }

    if((lead & 0xc0) < 0x80)
    {
      // Single-byte character: the lead bytes agree, so we are done.
      return true;
    }

    // Compare the continuation bytes (10xxxxxx) both sides have in common.
    while(
      (firstPos < first.length)
      && (secondPos < secondLength)
      && ((first[firstPos] & 0xc0) == 0x80)
      && ((second.get(secondPos) & 0xc0) == 0x80))
    {
      if((first[firstPos++] & 0xff) != (second.get(secondPos++) & 0xff))
      {
        return false;
      }
    }

    // All shared continuation bytes agree. The characters are equal only
    // if BOTH sequences end here, i.e. neither side has a continuation
    // byte left.
    final boolean firstEnded =
      (firstPos >= first.length) || ((first[firstPos] & 0xc0) != 0x80);
    final boolean secondEnded =
      (secondPos >= secondLength) || ((second.get(secondPos) & 0xc0) != 0x80);

    return firstEnded && secondEnded;
  }

  // Tests whether the given code point counts as white space. NUL (0) is
  // treated as white space; values that do not fit into a single char
  // (negative or above 0xffff) are not.
  private static boolean isspace(final int value)
  {
    if(value == 0)
    {
      return true;
    }

    final boolean fitsInChar = (value >= 0) && (value <= 0xffff);

    return fitsInChar && Character.isSpace((char)value);
  }

  // Finds the start of the character following the one at pos in a UTF-8
  // encoded byte array, skipping continuation bytes (10xxxxxx). Returns
  // the array length when there is no further character, and pos itself
  // when pos is already at or past the end.
  private static int nextChar(
    Uint8Array byteArray,
    int    pos)
  {
    final int limit = byteArray.length();

    if(pos >= limit)
    {
      return pos;
    }

    for(int next = pos + 1; next < limit; next++)
    {
      final boolean continuation = (byteArray.get(next) & 0xc0) == 0x80;

      if(!continuation)
      {
        return next;
      }
    }

    return limit;
  }

  // Determines whether the substring occurs in the text beginning at
  // location "from". whole_word indicates whether the located substring
  // must begin and end on a word boundary.  If there is a match, zone_list
  // receives the zones that cover the found copy; otherwise zone_list is
  // left empty. zone_list is always cleared first.
  //
  // substring is expected to be the trimmed (and, for a case-insensitive
  // search, lower-cased) search string as UTF-8 bytes.
  private void find_zones(
    final ArrayList<Zone> zone_list,
    final byte[] substring,
    int          from,
    boolean      whole_word,
    boolean      match_case)
  {
    zone_list.clear();

    // startsWith() will return true if the substring beginning at "from" begins
    // with punctuation. This can result in a false match, so we first check
    // that the leading characters are equal before we do the whole substring check.
    if(!char_equal(substring, 0, textByteArray, from))
    {
      if(!match_case)
      {
        // Case-insensitive search: accept the leading characters if they
        // are equal after upper-casing. Code points that do not fit into a
        // single char are never accepted here.
        final int c0 = getChar(convertArray(substring), 0);
        final int c1 = getChar(textByteArray, from);
        if(((c0 & 0xffff) != c0) 
            || ((c1 & 0xffff) != c1)
            || (Character.toUpperCase((char)c0) != Character.toUpperCase((char)c1)))
        {
          return;
        }
      }
      else
      {
        return;
      }
    }

    // A result greater than "from" is the end of the match; any other
    // value means the substring was not found here.
    int end = startsWith(substring, from, match_case);

    if(end > from)
    {
      // Match found at this location, get a list of the zones
      // covering the substring.
      find_smallest_zones(zone_list, from, end - from);

      if(!whole_word || (zone_list.size() == 0))
      {
        // Not a whole word search (or no zones at all), so we're done.
        // Return the list of zones as it is.
        return;
      }

      // It's a whole word search and the zone list isn't empty.
      // Get the zone that contains the beginning of the substring.
      Zone first = zone_list.get(0);

      if(first == null)
      {
        // No usable first element: give up and return the list unchanged.
        return;
      }

      // The string may be defined at the character level, if so move
      // up to the WORD level if possible.
      if(first.ztype > WORD)
      {
        first = first.get_parent();

        if(first == null)
        {
          // The character level zone didn't have a parent. This should not happen...
          return;
        }
      }

      // NOTE(review): when from is 0, prevChar() yields -1 and getChar() is
      // asked for a position before the text - confirm getChar copes with it.
      if(
        (first.text_start == from)
        || ((first.ztype != WORD)
        && !isJavaIdentifier(
          getChar(
            textByteArray,
            prevChar(textByteArray, from)))))
      // Start-of-word test:
      //    the zone begins exactly at the beginning of the substring OR
      //    we don't have a WORD zone and the character preceding the
      //    substring is not a word character (i.e. it is a separator).
      // In either of these cases the beginning of the substring is okay and
      // we go on to check the end.
      {
        // Get the last zone covering the substring, moved up to the WORD
        // level when it is a character zone.
        Zone last = zone_list.get(zone_list.size() - 1);

        if((last != null) && (last.ztype > WORD))
        {
          last = last.get_parent();
        }

        // End-of-word test: the zone ends exactly at the end of the match,
        // or the character following the match is not a word character.
        // NOTE(review): end may equal the text length here, so getChar() is
        // asked for a position past the text - confirm that is handled.
        if(
          ((last != null) && ((last.text_start + last.text_length) == end))
          || !isJavaIdentifier(getChar(textByteArray, end)))
        {
          return;
        }
      }

      // Not on word boundaries: report "no match".
      zone_list.clear();
    }

    return;
  }

  // Scans the range [start, start + length) backwards and returns the
  // position just after the last non-white-space character, i.e. where the
  // trailing white space begins. Returns start when the range holds no
  // non-white-space character.
  private int firstEndSpace(
    final Uint8Array byteArray,
    int          start,
    final int    length)
  {
    int pos = (start + length) - 1;

    while(pos >= start)
    {
      // Only bytes that start a character are decoded.
      final boolean startsCharacter = (byteArray.get(pos) & 0xc0) != 0x80;

      if(startsCharacter && !isspace(getChar(byteArray, pos)))
      {
        return nextChar(byteArray, pos);
      }

      pos--;
    }

    return start;
  }

  /**
   * Finds the start of the character that precedes a byte position.
   *
   * @param byteArray text bytes to scan
   * @param pos byte position to step back from
   *
   * @return the nearest position below pos whose byte is not of the form
   *         10xxxxxx, -1 when there is no such position, or pos itself
   *         when pos is already negative
   */
  private static int prevChar(
    Uint8Array byteArray,
    int    pos)
  {
    if(pos < 0)
    {
      return pos;
    }

    for(int candidate = pos - 1; candidate >= 0; candidate--)
    {
      final int value = byteArray.get(candidate);

      if((value & 0xc0) != 0x80)
      {
        return candidate;
      }
    }

    return -1;
  }

  /**
   * Builds the list of the smallest zones, all of one type, that cover a
   * range of the page text.  The search starts at the CHARACTER level; as
   * soon as some position in the range has no zone of the current type the
   * partial result is discarded and the sweep restarts one level up, down
   * to the PAGE level.
   *
   * @param zone_list list to reuse for the result; it is cleared first, and
   *        a new list is created when this is null
   * @param start byte position where the range begins
   * @param length number of bytes in the range
   *
   * @return the zones in text order, or an empty list when the range holds
   *         only whitespace or no level covers it
   */
  private ArrayList<Zone> find_smallest_zones(
    ArrayList<Zone> zone_list,
    int    start,
    int    length)
  {
    if(zone_list == null)
    {
      zone_list = new ArrayList<>();
    }
    else
    {
      zone_list.clear();
    }

    // Trim whitespace from both ends of the range.  skipSpaces() stops at
    // an absolute end index, so it must be given the end of the range and
    // not the byte count: passing the count left leading whitespace in
    // place whenever the range did not begin at position 0.
    int end = start + length;
    start   = skipSpaces(textByteArray, start, end);
    end     = firstEndSpace(textByteArray, start, end - start);

    if(start == end)
    {
      return zone_list; // nothing left, return the empty list
    }

    for(int zone_type = CHARACTER; zone_type >= PAGE;)
    {
      for(int xstart = start; xstart < end;)
      {
        // Locate the next non-space character. If none, we're
        // finished with the sweep of the byteArray.
        xstart = skipSpaces(textByteArray, xstart, end);

        if(xstart == end)
        {
          break;
        }

        // Locate the smallest zone of the type we're looking at
        // that contains the current position.
        Zone zone = get_smallest_zone(zone_type, xstart);

        if((zone == null) || (zone_type != zone.ztype))
        {
          // We didn't find one. Move up the type hierarchy and try
          // again. Empty the zone list, first, though.
          zone_type--;
          zone_list.clear();

          break;
        }

        // We found one. Append it to the list and continue the sweep
        // just past the text this zone covers.
        zone_list.add(zone);
        xstart = zone.text_start + zone.text_length;
      }

      if(zone_list.size() != 0)
      {
        // We got all the way through and produced a zone list
        // so we can stop hunting
        return zone_list;
      }
    }

    return zone_list;
  }

  /**
   * Walks down the zone tree from the page zone towards the zone that holds
   * a text position.
   *
   * @param max_type zone type at which the descent stops; a zone is only
   *        descended into while its own type is below this value
   * @param start byte position in the page text
   *
   * @return the deepest zone reached (its type may still be below max_type
   *         when no child holds the position), or null when the page zone
   *         does not hold the position
   */
  private Zone get_smallest_zone(
    int max_type,
    int start)
  {
    // search_zone() hands the position back unchanged on a miss.
    if(search_zone(page_zone, start) == start)
    {
      return null;
    }

    Zone current = page_zone;

    while(current.ztype < max_type)
    {
      final List<Zone> kids  = current.children;
      Zone             match = null;

      // The first child holding the position wins.
      for(int i = 0; (match == null) && (i < kids.size()); i++)
      {
        final Zone candidate = kids.get(i);

        if(search_zone(candidate, start) > start)
        {
          match = candidate;
        }
      }

      if(match == null)
      {
        // No child holds the position; settle for the current zone.
        break;
      }

      current = match;
    }

    return current;
  }

  /**
   * Advances to the next character accepted by isJavaIdentifier().
   *
   * @param byteArray text bytes to scan
   * @param pos byte position to start from
   *
   * @return the position of the first such character at or after pos, or
   *         the first position at or beyond the end of the array when there
   *         is none
   */
  private int nextJavaIdentifier(
    final Uint8Array byteArray,
    int          pos)
  {
    int cursor = pos;

    for(;;)
    {
      if(cursor >= byteArray.length())
      {
        return cursor;
      }

      if(isJavaIdentifier(getChar(byteArray, cursor)))
      {
        return cursor;
      }

      cursor = nextChar(byteArray, cursor);
    }
  }

  /**
   * Tests whether a zone's text range holds a byte position.
   *
   * @param zone zone to test
   * @param start byte position in the page text
   *
   * @return the position just past the zone's text when the zone holds
   *         start, otherwise start itself
   */
  private int search_zone(
    final Zone zone,
    final int  start)
  {
    final int zoneBegin = zone.text_start;
    final int zoneEnd   = zone.text_start + zone.text_length;

    if((start >= zoneBegin) && (start < zoneEnd))
    {
      return zoneEnd;
    }

    return start;
  }

  /**
   * Advances past whitespace characters.
   *
   * @param byteArray text bytes to scan
   * @param pos byte position to start from
   * @param length absolute byte position at which scanning stops; despite
   *        the name it is compared directly against the position, so it is
   *        an end index and not a count
   *
   * @return the position of the first non-whitespace character at or after
   *         pos, or the first position reached at or beyond the bound
   */
  private int skipSpaces(
    final Uint8Array byteArray,
    int          pos,
    final int    length)
  {
    int cursor = pos;

    for(;;)
    {
      if(cursor >= length)
      {
        return cursor;
      }

      if(!isspace(getChar(byteArray, cursor)))
      {
        return cursor;
      }

      cursor = nextChar(byteArray, cursor);
    }
  }

  /**
   * Advances past whitespace characters, scanning up to the end of the
   * array.
   *
   * @param byteArray text bytes to scan
   * @param pos byte position to start from
   *
   * @return the position of the first non-whitespace character at or after
   *         pos, or the first position reached at or beyond the array end
   */
  private int skipSpaces(
    final Uint8Array byteArray,
    int          pos)
  {
    final int limit = byteArray.length();

    return skipSpaces(byteArray, pos, limit);
  }

  // This is a more efficient internal version of startsWith, as it
  // takes as input the string already converted to a byte array.
  //
  // Tests whether the page text, beginning at byte position from, matches
  // the given substring.  Matching is done character by character; a run
  // of whitespace in the substring is matched loosely (see below).
  //
  // substringArray - bytes of the substring to look for
  // from           - byte position in textByteArray where matching starts
  // match_case     - true to require identical characters, false to also
  //                  accept characters whose upper-case forms are equal
  //
  // Returns the byte position in textByteArray just past the matched text,
  // or from when there is no match.  An empty substring also returns from.
  private int startsWith(
    byte[]        substringArray,
    final int     from,
    final boolean match_case)
  {
    if(substringArray.length == 0)
    {
      return from;
    }

    // end walks through the page text, pos walks through the substring
    int end = from;
    int pos = 0;
    Uint8Array substring = convertArray(substringArray);

    // c0 is always the current substring character, c1 the current text one
    for(int c0 = getChar(substring, 0); end < textByteArray.length();)
    {
      int c1 = getChar(textByteArray, end);

      // Mismatch when the characters differ and either case matters, or one
      // of them does not fit in 16 bits (so it cannot be case-folded as a
      // char), or their upper-case forms differ.
      if(
        (c0 != c1)
        && (match_case
        || (((c0 & 0xffff) != c0) || ((c1 & 0xffff) != c1)
        || (Character.toUpperCase((char)c0) != Character.toUpperCase(
          (char)c1)))))
      {
        return from;
      }

      pos   = nextChar(substring, pos);
      end   = nextChar(textByteArray, end);

      // The whole substring has been consumed: match.
      if(pos >= substring.length())
      {
        return end;
      }

      c0 = getChar(substring, pos);

      if(!isJavaIdentifier(c0))
      {
        // The substring has a non-identifier character here, so the text
        // must not continue with an identifier character.
        // NOTE(review): end may equal textByteArray.length() at this point;
        // getChar() is defined outside this block, so confirm it tolerates
        // a position one past the last byte.
        c1 = getChar(textByteArray, end);

        if(isJavaIdentifier(c1))
        {
          return from;
        }

        if(isspace(c0))
        {
          // Collapse the run of whitespace in the substring.
          pos = skipSpaces(substring, pos);

          // Trailing whitespace in the substring does not need a partner
          // in the text.
          if(pos >= substring.length())
          {
            return end;
          }

          c0 = getChar(substring, pos);

          // Line the text up with what follows the whitespace: when the
          // substring resumes with an identifier character, jump to the
          // next identifier character in the text (skipping any other
          // characters in between); otherwise skip only whitespace.
          if(isJavaIdentifier(c0))
          {
            end = nextJavaIdentifier(textByteArray, end);
          }
          else
          {
            end = skipSpaces(textByteArray, end);
          }
        }
      }
    }

    // Ran out of page text before the substring was fully matched.
    return from;
  }

  //~ Inner Classes ----------------------------------------------------------

  /**
   * Data structure representing document textual components. The text
   * structure is represented by a hierarchy of rectangular zones: every zone
   * is a rectangle tied to a byte range of the page text and may hold child
   * zones.
   */
  public static class Zone
    extends GRect
  {
    //~ Static fields/initializers -------------------------------------------

    // I think this indicates the version of text decoding being used.
    private static final int version = 1;

    // Message of the exception raised for any malformed zone record.
    private static final String CORRUPT_TEXT = "DjVuText.corrupt_text";

    //~ Instance fields ------------------------------------------------------

    /** List of children zone. */
    public JsArrayList<Zone> children = new JsArrayList<>();

    /**
     * Controls whether separators are added between lexical elements. This
     * is included to handle differences in languages. In English, for
     * example, words are separated by spaces and when searching, the spaces
     * are significant. In Japanese, there are no spaces and words may also
     * be broken between lines.  We would expect add_separators to be true
     * for English (default) and false for Japanese.
     */
    public boolean add_separators = true;

    /** Length of the zone text in substring textByteArray. */
    public int text_length = 0;

    /** Position of the zone text in substring textByteArray. */
    public int text_start = 0;

    /** Type of the zone. */
    public int   ztype       = DjVuText.PAGE;

    /** Zone this one was appended to, or null when it has none. */
    private Zone zone_parent = null;

    //~ Constructors ---------------------------------------------------------

    /**
     * Creates a new Zone object.
     */
    public Zone() {}

    /**
     * Creates a deep copy of a zone.  Every descendant is copied as well and
     * linked to its copied parent; the parent link of the new top-level zone
     * itself is left unset.
     *
     * @param toCopy zone to duplicate
     */
    public Zone(Zone toCopy)
    {
      super(toCopy);
      this.add_separators   = toCopy.add_separators;
      this.text_length      = toCopy.text_length;
      this.text_start       = toCopy.text_start;
      this.ztype            = toCopy.ztype;
      this.children         = new JsArrayList<Zone>(toCopy.children);

      // The new list still refers to the original children; replace each
      // entry with its own copy.
      for(int i = 0; i < children.size(); i++)
      {
        final Zone child = new Zone(cast(children.get(i)));
        child.zone_parent = this;
        children.set(i, child);
      }
    }

    //~ Methods --------------------------------------------------------------

    // JSNI identity function: hands the object back typed as a Zone without
    // a Java-level cast.
    private static native Zone cast(Object o) /*-{
      return o;
    }-*/;

    /**
     * Appends another subzone inside this zone.  The new zone is initialized
     * with an empty rectangle, empty text, and has the same type and
     * add_separators setting as this zone.
     *
     * @return the newly appended child zone
     */
    public Zone append_child()
    {
      final Zone empty = new Zone();
      empty.ztype            = ztype;
      empty.add_separators   = add_separators; // This level's value is the next's default
      empty.zone_parent      = this;
      children.add(empty);

      return empty;
    }

    /**
     * Find the zones used by the specified substring and append them to the
     * list.  A zone lying completely inside the range is appended as a
     * whole; a zone that only overlaps the range hands the request on to
     * its children.
     *
     * @param list vector to append zones to
     * @param start byte position where the substring begins
     * @param end byte position just past the substring
     */
    public void append_zones(
      final ArrayList<Zone> list,
      final int    start,
      final int    end)
    {
      final int text_end = text_start + text_length;

      if(text_start >= start)
      {
        if(text_end <= end)
        {
          // Entirely inside the range.
          list.add(this);
        }
        else if(text_start < end)
        {
          // Begins inside the range but runs past its end.
          if(children.size() > 0)
          {
            for(int pos = 0; pos < children.size(); pos++)
            {
              children.get(pos).append_zones(list, start, end);
            }
          }
          else
          {
            // Cannot be split any further, so take it as it is.
            list.add(this);
          }
        }
      }
      else if(text_end > start)
      {
        // Begins before the range but reaches into it: only the children
        // can contribute.
        for(int pos = 0; pos < children.size(); pos++)
        {
          children.get(pos).append_zones(list, start, end);
        }
      }
    }

    /**
     * Find out this Zone's parent.
     *
     * @return the parent Zone
     */
    public Zone get_parent()
    {
      return zone_parent;
    }

    /**
     * Finds the smallest rectangles and appends them to the list.  The
     * smallest rectangles are the zones without children.
     *
     * @param list vector to append zones to
     */
    public void get_smallest(final List<? super Zone> list)
    {
      if(children.size() > 0)
      {
        for(Zone child : children)
        {
          child.get_smallest(list);
        }
      }
      else
      {
        list.add(this);
      }
    }

    /**
     * Finds the smallest rectangles and appends them to the list after
     * padding the smallest unit to fit width or height for the parent
     * rectangle and adding the number of specified pixels.
     *
     * @param list vector to append zones to
     * @param padding number of pixels to expand each zone by
     */
    public void get_smallest(
      ArrayList<GRect>    list,
      final int padding)
    {
      if(children.size() > 0)
      {
        for(int pos = 0; pos < children.size(); pos++)
        {
          children.get(pos).get_smallest(list, padding);
        }
      }
      else if((zone_parent != null) && (zone_parent.ztype >= PARAGRAPH))
      {
        final GRect xrect = zone_parent;

        if(xrect.height() < xrect.width())
        {
          // Parent is wider than tall: keep this zone's horizontal extent
          // and take the vertical extent from the parent.
          list.add(
            new GRect(
              xmin - padding,
              xrect.ymin - padding,
              width() + (2 * padding),
              xrect.height() + (2 * padding)));
        }
        else
        {
          // Otherwise keep the vertical extent and take the horizontal
          // extent from the parent.
          list.add(
            new GRect(
              xrect.xmin - padding,
              ymin - padding,
              xrect.width() + (2 * padding),
              height() + (2 * padding)));
        }
      }
      else
      {
        // No suitable parent: pad this zone's own rectangle.
        list.add(
          new GRect(
            xmin - padding,
            ymin - padding,
            width() + (2 * padding),
            height() + (2 * padding)));
      }
    }

    /**
     * Find the text_start and text_end indicated by the given box.  A zone
     * with children counts only when the box contains it entirely, a zone
     * without children counts as soon as it intersects the box.  The range
     * held by the two contexts is widened to include every zone that
     * counts; a range whose start equals its end is treated as empty and is
     * overwritten.
     *
     * @param box rectangle selecting the text
     * @param byteArray_start holds the start of the byte range found so
     *        far; updated in place
     * @param byteArray_end holds the end of the byte range found so far;
     *        updated in place
     */
    public void get_text_with_rect(
      final GRect box,
      NumContext  byteArray_start,
      NumContext  byteArray_end)
    {
      final boolean hasChildren = (children.size() > 0);

      if(hasChildren
        ? box.contains(this)
        : intersects_zone(box, this))
      {
        final int text_end = text_start + text_length;

        if(byteArray_start.intValue() == byteArray_end.intValue())
        {
          byteArray_start.set(text_start);
          byteArray_end.set(text_end);
        }
        else
        {
          if(byteArray_end.intValue() < text_end)
          {
            byteArray_end.set(text_end);
          }

          if(text_start < byteArray_start.intValue())
          {
            byteArray_start.set(text_start);
          }
        }
      }
      else if(hasChildren && intersects_zone(box, this))
      {
        // Only partly covered: let the children decide for themselves.
        for(int pos = 0; pos < children.size(); pos++)
        {
          children.get(pos).get_text_with_rect(
            box,
            byteArray_start,
            byteArray_end);
        }
      }
    }

    // Query if a zone and a rectangle intersect.  Rectangles that merely
    // share an edge coordinate count as intersecting.
    private static boolean intersects_zone(
      final GRect box,
      final GRect zone)
    {
      return ((box.xmin < zone.xmin)
      ? (box.xmax >= zone.xmin)
      : (box.xmin <= zone.xmax))
      && ((box.ymin < zone.ymin)
      ? (box.ymax >= zone.ymin)
      : (box.ymin <= zone.ymax));
    }

    // Reads count bytes from the stream and combines them, most significant
    // byte first, into one non-negative value.  read() reports the end of
    // the stream with a negative result; that is turned into the same
    // exception as any other malformed record instead of being folded into
    // the value.
    private static int readUnsigned(
      final InputStream bs,
      final int         count)
      throws IOException
    {
      int value = 0;

      for(int i = 0; i < count; i++)
      {
        final int next = bs.read();

        if(next < 0)
        {
          throw new IOException(CORRUPT_TEXT);
        }

        value = (value << 8) | next;
      }

      return value;
    }

    // Decode this zone, and recursively its children, from the text data.
    //
    // A record is read as: 1 byte zone type; 2 bytes each for x, y, width
    // and height and 2 bytes for the text start (all stored with an offset
    // of 0x8000); 3 bytes text length; 3 bytes child count; then the child
    // records.  Coordinates and text start are stored relative to the
    // previous sibling when there is one, otherwise relative to the parent.
    //
    // bs      - stream positioned at the start of the record
    // maxtext - number of bytes of page text; no zone may reach beyond it
    // parent  - zone this one belongs to, or null for the top-level zone
    // prev    - sibling decoded just before this one, or null for the first
    //
    // Throws IOException when the stream ends inside the record, the type
    // is out of range, the rectangle is empty or the text range is invalid.
    private void decode(
      InputStream bs,
      int         maxtext,
      Zone        parent,
      Zone        prev)
      throws IOException
    {
      // Decode type
      ztype = readUnsigned(bs, 1);

      if((ztype < PAGE) || (ztype > CHARACTER))
      {
        throw new IOException(CORRUPT_TEXT);
      }

      // Decode coordinates
      xmin = readUnsigned(bs, 2) - 0x8000;
      ymin = readUnsigned(bs, 2) - 0x8000;

      final int width  = readUnsigned(bs, 2) - 0x8000;
      final int height = readUnsigned(bs, 2) - 0x8000;

      // Decode text info
      text_start    = readUnsigned(bs, 2) - 0x8000;
      text_length   = readUnsigned(bs, 3);

      if(prev != null)
      {
        if((ztype == PAGE) || (ztype == PARAGRAPH) || (ztype == LINE))
        {
          xmin += prev.xmin;
          ymin = prev.ymin - (ymin + height);
        }
        else // Either COLUMN or WORD or CHARACTER
        {
          xmin += prev.xmax;
          ymin += prev.ymin;
        }

        // Text offsets count from the end of the previous sibling's text.
        text_start += (prev.text_start + prev.text_length);
      }
      else if(parent != null)
      {
        xmin += parent.xmin;
        ymin = parent.ymax - (ymin + height);
        text_start += parent.text_start;
      }

      xmax   = xmin + width;
      ymax   = ymin + height;

      // Get children size
      int size = readUnsigned(bs, 3);

      // Checks
      if(
        isEmpty()
        || (text_start < 0)
        || ((text_start + text_length) > maxtext))
      {
        throw new IOException(CORRUPT_TEXT);
      }

      // Process children
      Zone prev_child = null;
      children.clear();

      while(size-- > 0)
      {
        Zone z = append_child();
        z.decode(bs, maxtext, this, prev_child);
        prev_child = z;
      }
    }

    // Decode this zone as the top-level zone: it has neither a parent nor a
    // previous sibling, so its coordinates and text start are taken as
    // stored.
    private void decode(
      InputStream bs,
      int         maxtext)
      throws IOException
    {
      decode(bs, maxtext, null, null);
    }
  }
}