python source code of page

#    Paperwork - Using OCR to grep dead trees the easy way
#    Copyright (C) 2012-2014  Jerome Flesch
#    Copyright (C) 2012  Sebastien Maccagnoni-Munch
#
#    Paperwork is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    Paperwork is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with Paperwork.  If not, see <http://www.gnu.org/licenses/>.

"""
Code relative to page handling.
"""

import PIL.Image

import logging
import pyocr
import pyocr.builders

from ..common.page import BasicPage
from ..util import image2surface


logger = logging.getLogger(__name__)


class ImgPage(BasicPage):

    """
    Represents a page. A page is a sub-element of ImgDoc.
    """

    EXT_IMG = "jpg"

    KEYWORD_HIGHLIGHT = 3

    can_edit = True

    def __init__(self, doc, page_nb=None):
        if page_nb is None:
            page_nb = doc.nb_pages
        BasicPage.__init__(self, doc, page_nb)

    def __get_box_path(self):
        """
        Returns the file path of the box list corresponding to this page
        """
        return self._get_filepath(self.EXT_BOX)

    __box_path = property(__get_box_path)

    def __get_img_path(self):
        """
        Returns the file path of the image corresponding to this page
        """
        return self._get_filepath(self.EXT_IMG)

    def get_doc_file_path(self):
        """
        Returns the file path of the image corresponding to this page
        """
        return self.__get_img_path()

    __img_path = property(__get_img_path)

    def __get_last_mod(self):
        try:
            return self.fs.getmtime(self.__get_box_path())
        except OSError:
            return 0.0

    last_mod = property(__get_last_mod)

    def _get_text(self):
        """
        Get the text corresponding to this page
        """
        boxes = self.boxes
        txt = []
        for line in boxes:
            txt_line = u""
            for box in line.word_boxes:
                txt_line += u" " + box.content
            txt.append(txt_line)
        return txt

    def __get_boxes(self):
        """
        Get all the word boxes of this page.
        """
        boxfile = self.__box_path

        try:
            box_builder = pyocr.builders.LineBoxBuilder()
            with self.fs.open(boxfile, 'r') as file_desc:
                boxes = box_builder.read_file(file_desc)
            if boxes != []:
                return boxes
            # fallback: old format: word boxes
            # shouldn't be used anymore ...
            box_builder = pyocr.builders.WordBoxBuilder()
            with self.fs.open(boxfile, 'r') as file_desc:
                boxes = box_builder.read_file(file_desc)
            if len(boxes) <= 0:
                return []
            logger.warning("WARNING: Doc %s uses old box format" %
                           (str(self.doc)))
            return [pyocr.builders.LineBox(boxes, boxes[0].position)]
        except IOError as exc:
            logger.error("Unable to get boxes for '%s': %s"
                         % (self.doc.docid, exc))
            return []

    def __set_boxes(self, boxes):
        boxfile = self.__box_path
        with self.fs.open(boxfile, 'w') as file_desc:
            pyocr.builders.LineBoxBuilder().write_file(file_desc, boxes)

    boxes = property(__get_boxes, __set_boxes)

    def __get_img(self):
        """
        Returns an image object corresponding to the page
        """
        with self.fs.open(self.__img_path, 'rb') as fd:
            img = PIL.Image.open(fd)
            img.load()
            return img

    def __set_img(self, img):
        with self.fs.open(self.__img_path, 'wb') as fd:
            img.save(fd, format="JPEG")

    img = property(__get_img, __set_img)

    def get_image(self, size):
        img = self.img
        return img.resize(size, PIL.Image.ANTIALIAS)

    def __get_size(self):
        with self.fs.open(self.__img_path, 'rb') as fd:
            img = PIL.Image.open(fd)
            return img.size

    size = property(__get_size)

    def print_page_cb(self, print_op, print_context, keep_refs={}):
        """
        Called for printing operation by Gtk
        """
        ORIENTATION_PORTRAIT = 0
        ORIENTATION_LANDSCAPE = 1
        scaling = 2.0

        img = self.img
        (width, height) = img.size

        # take care of rotating the image if required
        if print_context.get_width() <= print_context.get_height():
            print_orientation = ORIENTATION_PORTRAIT
        else:
            print_orientation = ORIENTATION_LANDSCAPE
        if width <= height:
            img_orientation = ORIENTATION_PORTRAIT
        else:
            img_orientation = ORIENTATION_LANDSCAPE
        if print_orientation != img_orientation:
            logger.info("Rotating the page ...")
            img = img.rotate(90, expand=True)

        (width, height) = img.size

        # scale the image down
        # XXX(Jflesch): beware that we get floats for the page size ...
        scaling = min(
            print_context.get_width() / width,
            print_context.get_height() / height
        )

        logger.info("DPI: %fx%f" % (print_context.get_dpi_x(),
                                    print_context.get_dpi_y()))

        surface = image2surface(img)
        keep_refs['surface_cache_' + str(self.page_nb)] = surface

        # .. and print !
        cairo_context = print_context.get_cairo_context()
        cairo_context.scale(scaling, scaling)
        cairo_context.set_source_surface(surface, 0, 0)
        cairo_context.paint()

    def change_index(self, offset=0):
        """
        Move the page number by a given offset. Beware to not let any hole
        in the page numbers when doing this. Make sure also that the wanted
        number is available.
        Will also change the page number of the current object.
        """
        src = {}
        src["box"] = self.__get_box_path()
        src["img"] = self.__get_img_path()
        src["thumb"] = self._get_thumb_path()

        page_nb = self.page_nb

        page_nb += offset

        logger.info("--> Moving page %d (+%d) to index %d"
                    % (self.page_nb, offset, page_nb))

        self.page_nb = page_nb

        dst = {}
        dst["box"] = self.__get_box_path()
        dst["img"] = self.__get_img_path()
        dst["thumb"] = self._get_thumb_path()

        for key in src.keys():
            if self.fs.exists(src[key]):
                if self.fs.exists(dst[key]):
                    logger.error("Error: file already exists: %s" % dst[key])
                    assert(0)
                self.fs.rename(src[key], dst[key])

    def destroy(self):
        """
        Delete the page. May delete the whole document if it's actually the
        last page.
        """
        logger.info("Destroying page: %s" % self)
        if self.doc.nb_pages <= 1:
            self.doc.destroy()
            return
        doc_pages = self.doc.pages[:]
        current_doc_nb_pages = self.doc.nb_pages
        paths = [
            self.__get_box_path(),
            self.__get_img_path(),
            self._get_thumb_path(),
        ]
        for path in paths:
            if self.fs.exists(path):
                self.fs.unlink(path)
        for page_nb in range(self.page_nb + 1, current_doc_nb_pages):
            page = doc_pages[page_nb]
            page.change_index(offset=-1)

    def _steal_content(self, other_page):
        """
        Call ImgDoc.steal_page() instead
        """
        other_doc = other_page.doc
        other_doc_pages = other_doc.pages[:]
        other_doc_nb_pages = other_doc.nb_pages
        other_page_nb = other_page.page_nb

        to_move = [
            (other_page.__get_box_path(), self.__get_box_path()),
            (other_page.__get_img_path(), self.__get_img_path()),
            (other_page._get_thumb_path(), self._get_thumb_path())
        ]
        for (src, dst) in to_move:
            # sanity check
            if self.fs.exists(dst):
                logger.error("Error, file already exists: %s" % dst)
                assert(0)
        for (src, dst) in to_move:
            logger.info("%s --> %s" % (src, dst))
            self.fs.rename(src, dst)

        if (other_doc_nb_pages <= 1):
            other_doc.destroy()
        else:
            for page_nb in range(other_page_nb + 1, other_doc_nb_pages):
                page = other_doc_pages[page_nb]
                page.change_index(offset=-1)

    def get_docfilehash(self):
        return self.doc.hash_file(self.fs, self.__get_img_path())

    def has_ocr(self):
        # always act as if images have OCR file attached
        return True