#!/usr/bin/env python # -*- coding: utf-8 -*- # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab from __future__ import unicode_literals, division, absolute_import, print_function from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr if PY2: range = xrange array_format = b'B' if PY3: unichr = chr array_format = "B" import array import struct # note: struct pack, unpack, unpack_from all require bytestring format # data all the way up to at least python 2.7.5, python 3 okay with bytestring from .mobi_index import getVariableWidthValue, readTagSection, getTagMap from .mobi_utils import toHex DEBUG_DICT = False class InflectionData(object): def __init__(self, infldatas): self.infldatas = infldatas self.starts = [] self.counts = [] for idata in self.infldatas: start, = struct.unpack_from(b'>L', idata, 0x14) count, = struct.unpack_from(b'>L', idata, 0x18) self.starts.append(start) self.counts.append(count) def lookup(self, lookupvalue): i = 0 rvalue = lookupvalue while rvalue >= self.counts[i]: rvalue = rvalue - self.counts[i] i += 1 if i == len(self.counts): print("Error: Problem with multiple inflections data sections") return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0] return rvalue, self.starts[i], self.counts[i], self.infldatas[i] def offsets(self, value): rvalue, start, count, data = self.lookup(value) offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) if rvalue + 1 < count: nextOffset, = struct.unpack_from(b'>H',data, start + 4 + (2 * (rvalue + 1))) else: nextOffset = None return offset, nextOffset, data class dictSupport(object): def __init__(self, mh, sect): self.mh = mh self.header = mh.header self.sect = sect self.metaOrthIndex = mh.metaOrthIndex self.metaInflIndex = mh.metaInflIndex def parseHeader(self, data): "read INDX header" if not data[:4] == b'INDX': print("Warning: index section is not INDX") return False words = ( 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' ) num = len(words) values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) header = {} for n in range(num): header[words[n]] = values[n] ordt1 = None ordt2 = None otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) header['otype'] = otype header['oentries'] = oentries if DEBUG_DICT: print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx)) if header['code'] == 0xfdea or oentries > 0: # some dictionaries seem to be codepage 65002 (0xFDEA) which seems # to be some sort of strange EBCDIC utf-8 or 16 encoded strings # So we need to look for them and store them away to process leading text # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries # we only ever seem to use the second but ... # # if otype = 0, ORDT table uses 16 bit values as offsets into the table # if otype = 1, ORDT table uses 8 bit values as offsets inot the table assert(data[op1:op1+4] == b'ORDT') assert(data[op2:op2+4] == b'ORDT') ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4) ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4) if DEBUG_DICT: print("parsed INDX header:") for key in header: print(key, "%x" % header[key],) print("\n") return header, ordt1, ordt2 def getPositionMap(self): sect = self.sect positionMap = {} metaOrthIndex = self.metaOrthIndex metaInflIndex = self.metaInflIndex decodeInflection = True if metaOrthIndex != 0xFFFFFFFF: print("Info: Document contains orthographic index, handle as dictionary") if metaInflIndex == 0xFFFFFFFF: decodeInflection = False else: metaInflIndexData = sect.loadSection(metaInflIndex) print("\nParsing metaInflIndexData") midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData) metaIndexCount = midxhdr['count'] idatas = [] for j in range(metaIndexCount): idatas.append(sect.loadSection(metaInflIndex + 1 + j)) dinfl = InflectionData(idatas) inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount) tagSectionStart = midxhdr['len'] inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData) if DEBUG_DICT: print("inflectionTagTable: %s" % inflectionTagTable) if self.hasTag(inflectionTagTable, 0x07): print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported") decodeInflection = False data = sect.loadSection(metaOrthIndex) print("\nParsing metaOrthIndex") idxhdr, hordt1, hordt2 = self.parseHeader(data) tagSectionStart = idxhdr['len'] controlByteCount, tagTable = readTagSection(tagSectionStart, data) orthIndexCount = idxhdr['count'] print("orthIndexCount is", orthIndexCount) if DEBUG_DICT: print("orthTagTable: %s" % tagTable) if hordt2 is not None: print("orth entry uses ordt2 lookup table of type ", idxhdr['otype']) hasEntryLength = self.hasTag(tagTable, 0x02) if not hasEntryLength: print("Info: Index doesn't contain entry length tags") print("Read dictionary index data") for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount): data = sect.loadSection(i) hdrinfo, ordt1, ordt2 = self.parseHeader(data) idxtPos = hdrinfo['start'] entryCount = hdrinfo['count'] idxPositions = [] for j in range(entryCount): pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j)) idxPositions.append(pos) # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) idxPositions.append(idxtPos) for j in range(entryCount): startPos = idxPositions[j] endPos = idxPositions[j+1] textLength = ord(data[startPos:startPos+1]) text = data[startPos+1:startPos+1+textLength] if hordt2 is not None: utext = u"" if idxhdr['otype'] == 0: pattern = b'>H' inc = 2 else: pattern = b'>B' inc = 1 pos = 0 while pos < textLength: off, = struct.unpack_from(pattern, text, pos) if off < len(hordt2): utext += unichr(hordt2[off]) else: utext += unichr(off) pos += inc text = utext.encode('utf-8') tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) if 0x01 in tagMap: if decodeInflection and 0x2a in tagMap: inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, dinfl, inflNameData, tagMap[0x2a]) else: inflectionGroups = b'' assert len(tagMap[0x01]) == 1 entryStartPosition = tagMap[0x01][0] if hasEntryLength: # The idx:entry attribute "scriptable" must be present to create entry length tags. ml = b'<idx:entry scriptable="yes"><idx:orth value="' + text + b'">' + inflectionGroups + b'</idx:orth>' if entryStartPosition in positionMap: positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml else: positionMap[entryStartPosition] = ml assert len(tagMap[0x02]) == 1 entryEndPosition = entryStartPosition + tagMap[0x02][0] if entryEndPosition in positionMap: positionMap[entryEndPosition] = b"</idx:entry>" + positionMap[entryEndPosition] else: positionMap[entryEndPosition] = b"</idx:entry>" else: indexTags = b'<idx:entry>\n<idx:orth value="' + text + b'">\n' + inflectionGroups + b'</idx:entry>\n' if entryStartPosition in positionMap: positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags else: positionMap[entryStartPosition] = indexTags return positionMap def hasTag(self, tagTable, tag): ''' Test if tag table contains given tag. @param tagTable: The tag table. @param tag: The tag to search. @return: True if tag table contains given tag; False otherwise. ''' for currentTag, _, _, _ in tagTable: if currentTag == tag: return True return False def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList): ''' Create string which contains the inflection groups with inflection rules as mobipocket tags. @param mainEntry: The word to inflect. @param controlByteCount: The number of control bytes. @param tagTable: The tag table. @param data: The Inflection data object to properly select the right inflection data section to use @param inflectionNames: The inflection rule name data. @param groupList: The list of inflection groups to process. @return: String with inflection groups and rules or empty string if required tags are not available. ''' result = b"" for value in groupList: offset, nextOffset, data = dinfl.offsets(value) # First byte seems to be always 0x00 and must be skipped. assert ord(data[offset:offset+1]) == 0x00 tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset) # Make sure that the required tags are available. if 0x05 not in tagMap: print("Error: Required tag 0x05 not found in tagMap") return "" if 0x1a not in tagMap: print("Error: Required tag 0x1a not found in tagMap") return b'' result += b'<idx:infl>' for i in range(len(tagMap[0x05])): # Get name of inflection rule. value = tagMap[0x05][i] consumed, textLength = getVariableWidthValue(inflectionNames, value) inflectionName = inflectionNames[value+consumed:value+consumed+textLength] # Get and apply inflection rule across possibly multiple inflection data sections value = tagMap[0x1a][i] rvalue, start, count, data = dinfl.lookup(value) offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) textLength = ord(data[offset:offset+1]) inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength) if inflection is not None: result += b' <idx:iform name="' + inflectionName + b'" value="' + inflection + b'"/>' result += b'</idx:infl>' return result def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end): ''' Apply inflection rule. @param mainEntry: The word to inflect. @param inflectionRuleData: The inflection rules. @param start: The start position of the inflection rule to use. @param end: The end position of the inflection rule to use. @return: The string with the inflected word or None if an error occurs. ''' mode = -1 byteArray = array.array(array_format, mainEntry) position = len(byteArray) for charOffset in range(start, end): char = inflectionRuleData[charOffset:charOffset+1] abyte = ord(char) if abyte >= 0x0a and abyte <= 0x13: # Move cursor backwards offset = abyte - 0x0a if mode not in [0x02, 0x03]: mode = 0x02 position = len(byteArray) position -= offset elif abyte > 0x13: if mode == -1: print("Error: Unexpected first byte %i of inflection rule" % abyte) return None elif position == -1: print("Error: Unexpected first byte %i of inflection rule" % abyte) return None else: if mode == 0x01: # Insert at word start byteArray.insert(position, abyte) position += 1 elif mode == 0x02: # Insert at word end byteArray.insert(position, abyte) elif mode == 0x03: # Delete at word end position -= 1 deleted = byteArray.pop(position) if bchr(deleted) != char: if DEBUG_DICT: print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) print("Error: Delete operation of inflection rule failed") return None elif mode == 0x04: # Delete at word start deleted = byteArray.pop(position) if bchr(deleted) != char: if DEBUG_DICT: print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) print("Error: Delete operation of inflection rule failed") return None else: print("Error: Inflection rule mode %x is not implemented" % mode) return None elif abyte == 0x01: # Insert at word start if mode not in [0x01, 0x04]: position = 0 mode = abyte elif abyte == 0x02: # Insert at word end if mode not in [0x02, 0x03]: position = len(byteArray) mode = abyte elif abyte == 0x03: # Delete at word end if mode not in [0x02, 0x03]: position = len(byteArray) mode = abyte elif abyte == 0x04: # Delete at word start if mode not in [0x01, 0x04]: position = 0 # Delete at word start mode = abyte else: print("Error: Inflection rule mode %x is not implemented" % abyte) return None return utf8_str(byteArray.tostring())