# Parse the Spotlight store.db file from mac OSX # # (c) Yogesh Khatri - 2018 www.swiftforensics.com # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You can get a copy of the complete license here: # <http://www.gnu.org/licenses/>. # # Script Name : spotlight_parser.py # Author : Yogesh Khatri # Last Updated : 04/01/2019 # Requirement : Python 3.7, modules ( lz4, enum34 ) and lzfse library # Dependencies can be installed using the command 'pip install lz4 enum34' # You will also need to install the lzfse decompression library. Follow the # instructions here (https://github.com/ydkhatri/mac_apt/tree/master/Libraries_For_Windows) # to install lzfse.pyd. This is dependent on your version of python and platform. # # Purpose : Parse the Spotlight store.db or .store.db file from mac OSX # These files are located under: # /.Spotlight-V100/Store-V2/<UUID>/ # # Since macOS 10.13, there are also spotlight databases for each user under # ~/Library/Metadata/CoreSpotlight/index.spotlightV3/ # # iOS Spotlight databases are found at location # /private/var/mobile/Library/Spotlight/CoreSpotlight/***/index.spotlightV2 # where *** is one of NSFileProtectionComplete, NSFileProtectionCompleteUnlessOpen or # NSFileProtectionCompleteUntilFirstUserAuthentication. For iOS databases, you # will need to have the files that begin with 'dbStr' (which are available # in the same folder as store.db. These files are specific to that instance # of store.db. Ideally, just extract the whole folder instead of just the single # store.db file. # # Usage : spotlight_parser.py [-p OUTPUT_PREFIX] <path_to_database> <output_folder> # Example: python.exe spotlight_parser.py c:\store.db c:\store_output # # Ack : M Bartle for most of the python3 porting # # Feedback : Send bugs and feedback to yogesh@swiftforensics.com # import zlib import lz4.block import time import struct import binascii import datetime import os import sys import logging from enum import IntEnum lzfse_capable = False try: import lzfse lzfse_capable = True except ImportError: print("lzfse not found. Won't decompress lzfse/lzvn streams") __VERSION__ = '0.9.1' log = logging.getLogger('SPOTLIGHT_PARSER') class FileMetaDataListing: def __init__(self, file_pos, data, size): self.file_pos = file_pos self.pos = 0 self.data = data self.size = size self.meta_data_dict = {} # { kMDItemxxx: value1, kMCItemyyy: value2, ..} # self.id = 0 # inode number self.flags = 0 self.item_id = 0 self.parent_id = 0 # inode for parent folder self.date_updated = None self.full_path = '' def ReadFloat(self): num = struct.unpack("<f", self.data[self.pos : self.pos + 4])[0] self.pos += 4 return num def ReadDouble(self): num = struct.unpack("<d", self.data[self.pos : self.pos + 8])[0] self.pos += 8 return num def ReadDate(self): '''Returns date as string''' # Date stored as 8 byte double, it is mac absolute time (2001 epoch) mac_abs_time = self.ReadDouble() if mac_abs_time > 0: # Sometimes, a very large number that needs to be reinterpreted as signed int old = mac_abs_time mac_abs_time = struct.unpack("<q", struct.pack("<Q", int(mac_abs_time)) )[0] # double to signed int64 if int(old) == mac_abs_time: # int(536198400.512156) == 536198400 = True mac_abs_time = old # preserve extra precision after decimal point try: return datetime.datetime(2001,1,1) + datetime.timedelta(seconds = mac_abs_time) except (ValueError, OverflowError, struct.error): pass return "" def ConvertEpochToUtcDateStr(self, value): '''Convert Epoch microseconds timestamp to string''' try: return datetime.datetime(1970, 1, 1) + datetime.timedelta(seconds=value/1000000.) except OverflowError: pass return "" def ReadVarSizeNum(self): '''Returns num and bytes_read''' num, bytes_read = SpotlightStore.ReadVarSizeNum(self.data[self.pos : min(self.size, 9 + self.size)]) self.pos += bytes_read return num, bytes_read def ReadStr(self, dont_decode=False): '''Returns single string of data and bytes_read''' size, pos = self.ReadVarSizeNum() string = self.data[self.pos:self.pos + size] if string[-1] == 0: string = string[:-1] # null character if string.endswith(b'\x16\x02'): string = string[:-2] self.pos += size if dont_decode: return string, size + pos return string.decode('utf8', "backslashreplace"), size + pos def ReadStrings(self, dont_decode=False): '''Returns array of strings found in data and bytes_read''' size, pos = self.ReadVarSizeNum() all_strings_in_one = self.data[self.pos:self.pos+size] strings = [x for x in all_strings_in_one.split(b'\x00') if x != b''] if dont_decode: strings = [x[:-2] if x.endswith(b'\x16\x02') else x for x in strings] else: strings = [x[:-2].decode('utf8', "backslashreplace") if x.endswith(b'\x16\x02') else x.decode('utf8', "backslashreplace") for x in strings] self.pos += size return strings, size + pos def ReadSingleByte(self): single = struct.unpack("<B", self.data[self.pos : self.pos + 1])[0] self.pos += 1 return single def ReadManyBytes(self, count, debug_dont_advance = False): '''Returns tuple, does not increment file pointer''' many = struct.unpack("<" + str(count) + "B", self.data[self.pos : self.pos + count]) if debug_dont_advance: return many self.pos += count return many # No usages def ReadManyBytesReturnHexString(self, count, debug_dont_advance = False): '''does not increment file pointer''' many = self.ReadManyBytes(count, debug_dont_advance) ret = ''.join('{:02X}'.format(x) for x in many) return ret def GetFileName(self): if self.meta_data_dict.get('_kStoreMetadataVersion', None) != None: # plist, not metadata return '------PLIST------' name = self.meta_data_dict.get('_kMDItemFileName', None) if name == None: name = self.meta_data_dict.get('kMDItemDisplayName') if name: if type(name) == list: name = name[0] if '\x16\x02' in name: name = name.split('\x16\x02')[0] else: name = '------NONAME------' return name def StringifyValue(self, v): if type(v) == list: if v: if len(v) == 1: v = v[0] else: if type(v[0]) != str: v = ', '.join([str(x) for x in v]) else: v = ', '.join(v) else: v = '' if type(v) not in (bytes, str): v = str(v) if type(v) == bytes: v = v.decode('utf-8', 'backslashreplace') return v def Print(self, file): try: dashed_line = "-"*60 info = "Inode_Num --> {}\r\nFlags --> {}\r\nStore_ID --> {}\r\nParent_Inode_Num --> {}\r\nLast_Updated --> {}\r\n".format(self.id, self.flags, self.item_id, self.parent_id, self.ConvertEpochToUtcDateStr(self.date_updated)) file.write((dashed_line + '\r\n' + info).encode('utf-8', 'backslashreplace')) for k, v in sorted(self.meta_data_dict.items()): orig_debug = v v = self.StringifyValue(v) file.write((k + " --> " + v).encode('utf-8', 'backslashreplace')) file.write(b'\r\n') except Exception as ex: log.exception("Exception trying to print data : ") def ConvertUint64ToSigned(self, unsigned_num): '''Return signed version of number, Eg: 0xFFFFFFFFFFFFFFFF will return -1''' return struct.unpack("<q", struct.pack("<Q", unsigned_num))[0] def ConvertUint32ToSigned(self, unsigned_num): '''Return signed version of number, Eg: 0xFFFFFFFF will return -1''' return struct.unpack("<i", struct.pack("<I", unsigned_num))[0] def ParseItem(self, properties, categories, indexes_1, indexes_2): self.id = self.ConvertUint64ToSigned(self.ReadVarSizeNum()[0]) self.flags = self.ReadSingleByte() self.item_id = self. ConvertUint64ToSigned(self.ReadVarSizeNum()[0]) self.parent_id = self.ConvertUint64ToSigned(self.ReadVarSizeNum()[0]) self.date_updated = self.ReadVarSizeNum()[0] ## type = bytes used # 00 = byte or varNum ? bool? # 02 = byte or varNum ? # 06 = byte or varNum ? # 07 = varNum # 08 = ? # 09 = float (4 bytes) # 0a = double (8 bytes) # 0b = var (len+data) # 0c = double (8 bytes) --> mac_abs_time # 0e = var (len+data) # 0f = varNum? prop_index = 0 last_prop = None # for debug only last_filepos = 0 # for debug only filepos = None prop = None while self.pos < self.size: last_filepos = filepos filepos = hex(self.file_pos + 0 + self.pos) prop_skip_index = self.ReadVarSizeNum()[0] if prop_skip_index == 0: log.warning("Something went wrong, skip index was 0 @ {}".format(filepos)) prop_index += prop_skip_index last_prop = prop # for debug only prop = properties.get(prop_index, None) if prop == None: log.error("Error, cannot proceed, invalid property index {}, skip={}".format(prop_index, prop_skip_index)) return else: prop_name = prop[0] prop_type = prop[1] value_type = prop[2] value = '' if value_type == 0: value = self.ReadVarSizeNum()[0] elif value_type == 2: value = self.ReadVarSizeNum()[0] elif value_type == 6: value = self.ReadVarSizeNum()[0] elif value_type == 7: log.debug("Found value_type 7, prop_type=0x{:X} prop={} @ {}, pos 0x{:X}".format(prop_type, prop_name, filepos, self.pos)) if prop_type & 2 == 2: # == 0x0A: number = self.ReadVarSizeNum()[0] num_values = number >> 3 value = [self.ReadVarSizeNum()[0] for x in range(num_values)] discarded_bits = number & 0x07 if discarded_bits != 0: log.info('Discarded bits value was 0x{:X}'.format(discarded_bits)) else: # 0x48 (_kMDItemDataOwnerType, _ICItemSearchResultType, kMDItemRankingHint, FPCapabilities) # 0x4C (_kMDItemStorageSize, _kMDItemApplicationImporterVersion) # 0x0a (_kMDItemOutgoingCounts, _kMDItemIncomingCounts) firstbyte = 0x20 , then 4 bytes value = self.ReadVarSizeNum()[0] #if prop_type == 0x48: # Can perhaps be resolved to a category? Need to check. # print("") elif value_type == 8 and prop_name != 'kMDStoreAccumulatedSizes': if prop_type & 2 == 2: num_values = (self.ReadVarSizeNum()[0]) singles = [self.ReadSingleByte() for x in range(num_values)] value = singles else: value = self.ReadSingleByte() elif value_type == 9: if prop_type & 2 == 2: num_values = (self.ReadVarSizeNum()[0])//4 floats = [self.ReadFloat() for x in range(num_values)] value = floats else: value = self.ReadFloat() elif value_type == 0x0A: if prop_type & 2 == 2: num_values = (self.ReadVarSizeNum()[0])//8 doubles = [self.ReadDouble() for x in range(num_values)] value = doubles else: value = self.ReadDouble() elif value_type == 0x0B: value = self.ReadStrings()[0] if prop_type & 2 != 2: if len(value) == 0: value = '' elif len(value) == 1: value = value[0] else: log.warning('String was multivalue without multivalue bit set') elif value_type == 0x0C: if prop_type & 2 == 2: num_dates = (self.ReadVarSizeNum()[0])//8 dates = [] for x in range(num_dates): dates.append(self.ReadDate()) value = dates else: value = self.ReadDate() elif value_type == 0x0E: if prop_type & 2 == 2: value = self.ReadStrings(dont_decode=True if prop_name != 'kMDStoreProperties' else False)[0] else: value = self.ReadStr(dont_decode=True if prop_name != 'kMDStoreProperties' else False)[0] if prop_name != 'kMDStoreProperties': if type(value) == list: if len(value) == 1: value = binascii.hexlify(value[0]).decode('ascii').upper() else: value = [binascii.hexlify(item).decode('ascii').upper() for item in value] else: # single string value = binascii.hexlify(value).decode('ascii').upper() elif value_type == 0x0F: value = self.ConvertUint32ToSigned(self.ReadVarSizeNum()[0]) if value < 0: value = 'INVALID ({})'.format(value) else: old_value = value if prop_type & 3 == 3: # in (0x83, 0xC3, 0x03): # ItemKind value = indexes_2.get(value, None) if value == None: value = 'error getting index_2 for value {}'.format(old_value) else: for v in value: if v < 0: continue cat = categories.get(v, None) if cat == None: log.error('error getting category for index={} prop_type={} prop_name={}'.format(v, prop_type, prop_name)) else: all_translations = cat.split(b'\x16\x02') if len(all_translations) > 2: log.warning('Encountered more than one control sequence in single translation' 'string.') log.debug('Found this list: {}', other) value = all_translations[0].decode('utf8', 'backslashreplace') break # only get first, rest are language variants! elif prop_type & 0x2 == 0x2: #== 0x4A: # ContentTypeTree ItemUserTags value = indexes_1.get(value, None) if value == None: value = 'error getting index_1 for value {}'.format(old_value) else: tree = [] for v in value: if v < 0: continue cat = categories.get(v, None) if cat == None: log.error('error getting category for index={} prop_type={} prop_name={}'.format(v, prop_type, prop_name)) else: tree.append(cat.decode('utf8', 'backslashreplace')) value = tree else: #elif prop_type & 8 == 8: #== 0x48: # ContentType if value >= 0: cat = categories.get(value, None) if cat == None: log.error('error getting category for index={} prop_type={} prop_name={}'.format(v, prop_type, prop_name)) else: value = cat value = value.decode('utf8', 'backslashreplace') else: value = '' #else: # log.info("Not seen before value-type 0x0F item, prop_type={:X}, prop={}".format(prop_type, prop_name)) else: if prop_name != 'kMDStoreAccumulatedSizes': log.info("Pos={}, Unknown value_type {}, PROPERTY={}, PROP_TYPE={} ..RETURNING!".format(filepos, value_type, prop_name, prop_type)) return if prop_name in self.meta_data_dict: log.warning('Spotlight property {} had more than one entry for inode {}'.format(prop_name, self.id)) self.meta_data_dict[prop_name] = value class BlockType(IntEnum): UNKNOWN_0 = 0 METADATA = 0x09 PROPERTY = 0x11 CATEGORY = 0x21 UNKNOWN_41 = 0x41 INDEX = 0x81 def __str__(self): return self.name class StoreBlock0: def __init__(self, data): self.data = data self.signature = struct.unpack("<I", data[0:4])[0] if self.signature not in [0x64626D31, 0x64626D32]: # 1mbd or 2mbd (block 0) raise Exception("Unknown signature {:X} in block0! Can't parse".format(self.signature)) self.physical_size = struct.unpack("<I", data[4:8])[0] self.item_count = struct.unpack("<I", data[8:12])[0] self.unk_zero = struct.unpack("<I", data[12:16])[0] self.unk_type = struct.unpack("<I", data[16:20])[0] # Followed by indexes [last_id_in_block, offset_index, dest_block_size] # If sig==1mbd, then last_id_in_block is BigEndian else LE # Everything else LE self.indexes = [] pos = 20 for i in range (0, self.item_count): index = struct.unpack("<QII", data[pos : pos + 16]) # last_id_in_block is not used, so we don't care if it is read BE/LE self.indexes.append(index) pos += 16 class StoreBlock: def __init__(self, data): self.data = data self.pos = 0 self.signature = struct.unpack("<I", data[0:4])[0] if self.signature != 0x64627032: # 2pbd (most blocks) raise Exception("Unknown signature {:X} in block! Can't parse".format(self.signature)) self.physical_size = struct.unpack("<I", data[4:8])[0] self.logical_size = struct.unpack("<I", data[8:12])[0] self.block_type = struct.unpack("<I", data[12:16])[0] # self.unknown = struct.unpack("<I", data[16:20])[0] # usually zero or size of uncompressed data self.next_block_index = struct.unpack("<I", data[20:24])[0] self.unknown1 = struct.unpack("<I", data[24:28])[0] self.unknown2 = struct.unpack("<I", data[28:32])[0] class DbStrMapHeader: def __init__(self): self.sig = None self.unk1 = 0 self.unk2 = 0 self.unk3 = 0 self.next_free_location_in_map_data = 0 self.unk5 = 0 self.next_data_id_number = 0 self.unk7 = 0 self.unk8 = 0 self.unk9 = 0 self.num_deleted_entries = 0 self.unk10 = 0 self.unk11 = 0 def Parse(self, data): self.sig, self.unk1, self.unk2, self.unk3, self.next_free_location_in_map_data, \ self.unk5, self.next_data_id_number, self.unk7, self.unk8, self.unk9, \ self.num_deleted_entries, self.unk11, self.unk12 = struct.unpack("<Q12I", data[0:56]) if self.sig != 0x0000446174615000: log.warning("Header signature is different for DbStrMapHeader. Sig=0x{:X}".format(self.sig)) class SpotlightStore: def __init__(self, file_pointer): self.file = file_pointer #self.pos = 0 if not self.IsValidStore(): raise Exception('Not a version 2 Spotlight store.db file, invalid format!') self.file.seek(0) self.header = self.file.read(0x1000) self.flags = struct.unpack("<I", self.header[4:8])[0] self.header_unknowns = struct.unpack("6I", self.header[12:36]) self.header_size = self.ReadUint(self.header[36:40]) self.block0_size = self.ReadUint(self.header[40:44]) self.block_size = self.ReadUint(self.header[44:48]) self.index_blocktype_11 = self.ReadUint(self.header[48:52]) self.index_blocktype_21 = self.ReadUint(self.header[52:56]) self.index_blocktype_41 = self.ReadUint(self.header[56:60]) self.index_blocktype_81_1 = self.ReadUint(self.header[60:64]) self.index_blocktype_81_2 = self.ReadUint(self.header[64:68]) self.original_path = self.header[0x144:0x244].decode('utf-8').rstrip('\0') # 256 bytes self.file_size = self.GetFileSize(self.file) self.properties = {} self.categories = {} self.indexes_1 = {} self.indexes_2 = {} self.block0 = None def GetFileSize(self, file): '''Return size from an open file handle''' current_pos = file.tell() file.seek(0, 2) # Seek to end size = file.tell() file.seek(current_pos) # back to original position return size def IsValidStore(self): self.file.seek(0) signature = self.file.read(4) if signature == b'\x38\x74\x73\x64': # 8tsd return True return False def Seek(self, pos): self.pos = pos self.file.seek(pos) def ReadFromFile(self, size): data = self.file.read(size) self.pos += len(data) return data def ReadUint(self, data): return struct.unpack("<I", data)[0] def ReadUint64(self, data): return struct.unpack("<Q", data)[0] @staticmethod def ReadIndexVarSizeNum(data): '''Returns num and bytes_read''' byte = struct.unpack("B", data[0:1])[0] num_bytes_read = 1 ret = byte & 0x7F # remove top bit while (byte & 0x80) == 0x80: # highest bit set, need to read one more byte = struct.unpack("B", data[num_bytes_read:num_bytes_read + 1])[0] ret |= (byte & 0x7F) << (7 * num_bytes_read) num_bytes_read += 1 return ret, num_bytes_read @staticmethod def ReadVarSizeNum(data): '''Returns num and bytes_read''' first_byte = struct.unpack("B", data[0:1])[0] extra = 0 use_lower_nibble = True if first_byte == 0: return 0, 1 elif (first_byte & 0xF0) == 0xF0: # 4 or more use_lower_nibble = False if (first_byte & 0x0F)==0x0F: extra = 8 elif (first_byte & 0x0E)==0x0E: extra = 7 elif (first_byte & 0x0C)==0x0C: extra = 6 elif (first_byte & 0x08)==0x08: extra = 5 else: extra = 4 use_lower_nibble = True first_byte -= 0xF0 elif (first_byte & 0xE0) == 0xE0: extra = 3 first_byte -= 0xE0 elif (first_byte & 0xC0) == 0xC0: extra = 2 first_byte -=0xC0 elif (first_byte & 0x80) == 0x80: extra = 1 first_byte -= 0x80 if extra: num = 0 num += sum(struct.unpack('B', data[x:x+1])[0] << (extra - x) * 8 for x in range(1, extra + 1)) if use_lower_nibble: num = num + (first_byte << (extra*8)) return num, extra + 1 return first_byte, extra + 1 def ReadOffsets(self, offsets_content): ''' Read offsets and index information from dbStr-x.map.offsets file data. Returns list of lists [ [index, offset], [index, offset], .. ] ''' offsets_len = len(offsets_content) pos = 4 index = 1 offsets = [] # [ [index, offset], [index, offset], ..] while pos < offsets_len: off = struct.unpack("<I", offsets_content[pos:pos + 4])[0] if off == 0: break elif off != 1: # 1 is invalid (deleted) offsets.append([index, off]) index += 1 pos += 4 return offsets def ParsePropertiesFromFileData(self, data_content, offsets_content, header_content): data_len = len(data_content) header_len = len(header_content) header = DbStrMapHeader() header.Parse(header_content) # Parse offsets file offsets = self.ReadOffsets(offsets_content) # Parse data file data_version = struct.unpack("<H", data_content[0:2]) for index, offset in offsets: entry_size, bytes_moved = SpotlightStore.ReadVarSizeNum(data_content[offset:]) value_type, prop_type = struct.unpack("<BB", data_content[offset + bytes_moved : offset + bytes_moved + 2]) name = data_content[offset + bytes_moved + 2:offset + bytes_moved + entry_size].split(b'\x00')[0] self.properties[index] = [name.decode('utf-8', 'backslashreplace'), prop_type, value_type] def ParseProperties(self, block): data = block.data pos = 32 size = block.logical_size while pos < size: index, value_type, prop_type = struct.unpack("<IBB", data[pos : pos+6]) pos += 6 name = data[pos:pos+size].split(b'\x00')[0] pos += len(name) + 1 if len(name) < size else size self.properties[index] = [name.decode('utf-8', 'backslashreplace'), prop_type, value_type] def ParseCategoriesFromFileData(self, data_content, offsets_content, header_content): data_len = len(data_content) header_len = len(header_content) header = DbStrMapHeader() header.Parse(header_content) # Parse offsets file offsets = self.ReadOffsets(offsets_content) # Parse data file data_version = struct.unpack("<H", data_content[0:2]) for index, offset in offsets: entry_size, bytes_moved = SpotlightStore.ReadVarSizeNum(data_content[offset:]) name = data_content[offset + bytes_moved:offset + bytes_moved + entry_size].split(b'\x00')[0] self.categories[index] = name def ParseCategories(self, block): data = block.data pos = 32 size = block.logical_size while pos < size: index = struct.unpack("<I", data[pos : pos+4])[0] pos += 4 name = data[pos:pos+size].split(b'\x00')[0] pos += len(name) + 1 if len(name) < size else size # sanity check temp = self.categories.get(index, None) if temp != None: log.error("Error, category {} already exists!!".format(temp)) # end check self.categories[index] = name def ParseIndexesFromFileData(self, data_content, offsets_content, header_content, dictionary, has_extra_byte=False): data_len = len(data_content) header_len = len(header_content) header = DbStrMapHeader() header.Parse(header_content) # Parse offsets file offsets = self.ReadOffsets(offsets_content) # Parse data file data_version = struct.unpack("<H", data_content[0:2]) pos = 0 for index, offset in offsets: pos = offset entry_size, bytes_moved = SpotlightStore.ReadIndexVarSizeNum(data_content[pos:]) pos += bytes_moved index_size, bytes_moved = SpotlightStore.ReadVarSizeNum(data_content[pos:]) pos += bytes_moved if entry_size - index_size > 2: log.debug("ReadIndexVarSizeNum() read the number incorrectly?") else: log.debug("index={}, offset={}, entry_size=0x{:X}, index_size=0x{:X}".format(index, offset, entry_size, index_size)) if has_extra_byte: pos += 1 index_size = 4*int(index_size//4) ids = struct.unpack("<" + str(index_size//4) + "i", data_content[pos:pos + index_size]) # sanity check temp = dictionary.get(index, None) if temp != None: log.error("Error, category {} already exists!!".format(temp)) # end check dictionary[index] = ids def ParseIndexes(self, block, dictionary): data = block.data pos = 32 size = block.logical_size while pos < size: index = struct.unpack("<I", data[pos : pos+4])[0] pos += 4 index_size, bytes_moved = SpotlightStore.ReadVarSizeNum(data[pos:]) pos += bytes_moved padding = index_size % 4 pos += padding index_size = 4*int(index_size//4) ids = struct.unpack("<" + str(index_size//4) + "i", data[pos:pos + index_size]) pos += index_size # sanity check temp = dictionary.get(index, None) if temp != None: log.error("Error, category {} already exists!!".format(temp)) # end check dictionary[index] = ids def ProcessBlock(self, block, dictionary): if block.block_type == BlockType.UNKNOWN_0: pass elif block.block_type == BlockType.METADATA: pass elif block.block_type == BlockType.PROPERTY: self.ParseProperties(block) elif block.block_type == BlockType.CATEGORY: self.ParseCategories(block) elif block.block_type == BlockType.UNKNOWN_41: pass elif block.block_type == BlockType.INDEX: self.ParseIndexes(block, dictionary) else: log.info ('Unknown block type encountered: 0x{:.2X}'.format(block.block_type)) def ItemExistsInDictionary(self, items_to_compare, md_item): '''Check if md_item exists in the dictionary''' # items_to_compare[id] = [id, parent_id, name, full_path, date] hit = items_to_compare.get(md_item.id, None) if hit and (hit[4] == md_item.date_updated): return True return False def ParseMetadataBlocks(self, output_file, items, items_to_compare=None, process_items_func=None): # Index = [last_id_in_block, offset_index, dest_block_size] for index in self.block0.indexes: #go to offset and parse self.Seek(index[1] * 0x1000) block_data = self.ReadFromFile(self.block_size) compressed_block = StoreBlock(block_data) if compressed_block.block_type & 0xFF != BlockType.METADATA: raise Exception('Expected METADATA block, Unknown block type encountered: 0x{:X}'.format(compressed_block.block_type)) log.debug ("Trying to decompress compressed block @ 0x{:X}".format(index[1] * 0x1000 + 20)) try: if compressed_block.block_type & 0x1000 == 0x1000: # LZ4 compression if block_data[20:24] in [b'bv41', b'bv4-']: # check for bv41, version 97 in High Sierra has this header (bv41) and footer (bv4$) # There are often multiple chunks bv41.....bv41.....bv41.....bv4$ # Sometimes bv4- (uncompressed data) followed by 4 bytes length, then data chunk_start = 20 # bv41 offset uncompressed = b'' last_uncompressed = b'' header = block_data[chunk_start:chunk_start + 4] while (self.block_size > chunk_start) and (header != b'bv4$'): # b'bv41': log.debug("0x{:X} - {}".format(chunk_start, header)) if header == b'bv41': uncompressed_size, compressed_size = struct.unpack('<II', block_data[chunk_start + 4:chunk_start + 12]) last_uncompressed = lz4.block.decompress(block_data[chunk_start + 12: chunk_start + 12 + compressed_size], uncompressed_size, dict=last_uncompressed) chunk_start += 12 + compressed_size uncompressed += last_uncompressed elif header == b'bv4-': uncompressed_size = struct.unpack('<I', block_data[chunk_start + 4:chunk_start + 8])[0] uncompressed += block_data[chunk_start + 8:chunk_start + 8 + uncompressed_size] chunk_start += 8 + uncompressed_size else: log.warning('Unknown compression value @ 0x{:X} - {}'.format(chunk_start, header)) header = block_data[chunk_start:chunk_start + 4] else: uncompressed = lz4.block.decompress(block_data[20:compressed_block.logical_size], compressed_block.unknown - 20) elif compressed_block.block_type & 0x2000 == 0x2000: # LZFSE compression seen, also perhaps LZVN if not lzfse_capable: log.error('LZFSE library not available for LZFSE decompression, skipping block..') continue if block_data[20:23] == b'bvx': # check for header (bvx1 or bvx2 or bvxn) and footer (bvx$) chunk_start = 20 # bvx offset uncompressed = b'' header = block_data[chunk_start:chunk_start + 4] log.debug("0x{:X} - {}".format(chunk_start, header)) if header in [b'bvx1', b'bvx2', b'bvxn']: uncompressed_size = struct.unpack('<I', block_data[chunk_start + 4:chunk_start + 8])[0] uncompressed = lzfse.decompress(block_data[chunk_start : compressed_block.logical_size]) if len(uncompressed) != uncompressed_size: log.error('Decompressed size does not match stored value, DecompSize={}, Should_be={}'.format(len(uncompressed), uncompressed_size)) elif header == b'bvx-': uncompressed_size = struct.unpack('<I', block_data[chunk_start + 4:chunk_start + 8])[0] uncompressed = block_data[chunk_start + 8:chunk_start + 8 + uncompressed_size] else: log.warning('Unknown compression value @ 0x{:X} - {}'.format(chunk_start, header)) else: uncompressed = lz4.block.decompress(block_data[20:compressed_block.logical_size], compressed_block.unknown - 20) else: # zlib compression #compressed_size = compressed_block.logical_size - 20 uncompressed = zlib.decompress(block_data[20:compressed_block.logical_size]) except Exception as ex: log.error("Decompression error for block @ 0x{:X}\r\n{}".format(index[1] * 0x1000 + 20, str(ex))) if len(uncompressed) == 0: continue ## Now process it!! items_in_block = [] pos = 0 count = 0 meta_size = len(uncompressed) while (pos < meta_size): item_size = struct.unpack("<I", uncompressed[pos:pos+4])[0] md_item = FileMetaDataListing(pos + 4, uncompressed[pos + 4 : pos + 4 + item_size], item_size) try: md_item.ParseItem(self.properties, self.categories, self.indexes_1, self.indexes_2) if items_to_compare and self.ItemExistsInDictionary(items_to_compare, md_item): pass # if md_item exists in compare_dict, skip it, else add else: items_in_block.append(md_item) name = md_item.GetFileName() existing_item = items.get(md_item.id, None) if existing_item != None: log.warning('Item already present id={}, name={}, existing_name={}'.format(md_item.id, name, existing_item[2])) if existing_item[1] != md_item.parent_id: log.warning("Repeat item has different parent_id, existing={}, new={}".format(existing_item[1], md_item.parent_id)) if name != '------NONAME------': # got a real name if existing_item[2] == '------NONAME------': existing_item[2] = name else: # has a valid name if existing_item[2] != name: log.warning("Repeat item has different name, existing={}, new={}".format(existing_item[2], name)) else: # Not adding repeat elements items[md_item.id] = [md_item.id, md_item.parent_id, md_item.GetFileName(), None, md_item.date_updated] # id, parent_id, name, path, date except: log.exception('Error trying to process item @ block {:X} offset {}'.format(index[1] * 0x1000 + 20, pos)) pos += item_size + 4 count += 1 if process_items_func: process_items_func(items_in_block) for md_item in items_in_block: md_item.Print(output_file) def ParseBlockSequence(self, initial_index, type, dictionary): '''Follow the sequence of next_block_index to parse all blocks in the chain''' if initial_index == 0: log.warning('initial_index for block type 0x{:X} was invalid(zero), skipping it!'.format(type)) return self.Seek(initial_index * 0x1000) block_data = self.ReadFromFile(self.block_size) block = StoreBlock(block_data) if block.block_type != type: raise Exception('Not the right block type, got {} instead of {} !!'.format(block.block_type, type)) self.ProcessBlock(block, dictionary) while block.next_block_index != 0: self.Seek(block.next_block_index * 0x1000) block_data = self.ReadFromFile(self.block_size) block = StoreBlock(block_data) if block.block_type != type: raise Exception('Not the right block type, got {} instead of {} !!'.format(block.block_type, type)) self.ProcessBlock(block, dictionary) def ReadPageIndexesAndOtherDefinitions(self, only_read_block_0=False): '''Reads block zero that lists page indexes, then reads properties, categories and indexes''' self.Seek(self.header_size) block0_data = self.ReadFromFile(self.block0_size) self.block0 = StoreBlock0(block0_data) if not only_read_block_0: self.ParseBlockSequence(self.index_blocktype_11, BlockType.PROPERTY, self.properties) self.ParseBlockSequence(self.index_blocktype_21, BlockType.CATEGORY, self.categories) self.ParseBlockSequence(self.index_blocktype_81_1, BlockType.INDEX, self.indexes_1) self.ParseBlockSequence(self.index_blocktype_81_2, BlockType.INDEX, self.indexes_2) self.ParseBlockSequence(self.index_blocktype_41, BlockType.UNKNOWN_41, None) def ReadBlocksNoSeq(self): '''Reads all blocks as is, without consideration for sequence,, may miss or exclude some data or may read invalid data, if its a deleted chunk??''' # TODO: This function is incomplete! Do not use! self.Seek(self.header_size) block0_data = self.ReadFromFile(self.block0_size) block0 = StoreBlock0(block0_data) self.Seek(self.header_size + block0.physical_size) while self.pos < self.file_size: block_data = self.ReadFromFile(self.block_size) block = StoreBlock(block_data) self.ProcessBlock(block) if block.physical_size != self.block_size: raise Exception("Block size mismatch!") self.Seek(self.pos + self.block_size) def RecursiveGetFullPath(item, items_list): '''Return full path to given item, here items_list is dictionary''' # item = [id, parent_id, name, full_path, date] if item[3]: return item[3] if item[0] == 1: #is this plist? return 'plist' name = item[2] if item[0] == 2: # This is root if name == '': name = '/' item[3] = name return name search_id = item[1] if search_id == 0: search_id = 2 # root ret_path = '' found_item = items_list.get(search_id, None) if found_item != None: parent_path = RecursiveGetFullPath(found_item, items_list) ret_path = (parent_path + '/' + name) if parent_path != '/' else (parent_path + name) found_item[3] = parent_path elif search_id == 2: # root ret_path = ('/' + name) if name else '/' else: log.debug ('Err, could not find path for id {} '.format(search_id)) ret_path = '..NOT-FOUND../' + name return ret_path def GetFileData(path): data = b'' with open(path, 'rb') as f: data = f.read() return data def GetMapDataOffsetHeader(input_folder, id): ''' Given an id X, this returns the data from 3 files, dbStr-X.map.data, dbStr-X.map.header, dbStr-X.map.offsets. It will search for these files in the input_folder. Returns tuple (data, offsets, header) ''' data_path = os.path.join(input_folder, 'dbStr-{}.map.data'.format(id)) offsets_path = os.path.join(input_folder, 'dbStr-{}.map.offsets'.format(id)) header_path = os.path.join(input_folder, 'dbStr-{}.map.header'.format(id)) map_data = GetFileData(data_path) offsets_data = GetFileData(offsets_path) header_data = GetFileData(header_path) return (map_data, offsets_data, header_data) def ProcessStoreDb(input_file_path, output_path, file_name_prefix='store'): '''Main processing function''' items = {} time_processing_started = time.time() create_full_paths_output_file = True output_path_full_paths = os.path.join(output_folder, file_name_prefix + '_fullpaths.csv') output_path_data = os.path.join(output_folder, file_name_prefix + '_data.txt') log.info('Processing ' + input_file_path) try: f = open(input_file_path, 'rb') store = SpotlightStore(f) if store.flags & 0x00010000 == 0x00010000: create_full_paths_output_file = False log.info('This appears to be either an iOS spotlight db or a user spotlight db. '\ "File inode numbers are not stored here, and hence full_path file won't be created!") # check if needs other files if store.index_blocktype_11 == 0: # The properties, categories and indexes must be stored in external files # Find and parse files input_folder = os.path.dirname(os.path.abspath(input_file_path)) data_path = os.path.join(input_folder, 'dbStr-1.map.data') if os.path.isfile(data_path): try: prop_map_data, prop_map_offsets,prop_map_header = GetMapDataOffsetHeader(input_folder, 1) cat_map_data, cat_map_offsets, cat_map_header = GetMapDataOffsetHeader(input_folder, 2) idx_1_map_data, idx_1_map_offsets, idx_1_map_header = GetMapDataOffsetHeader(input_folder, 4) idx_2_map_data, idx_2_map_offsets, idx_2_map_header = GetMapDataOffsetHeader(input_folder, 5) store.ParsePropertiesFromFileData(prop_map_data, prop_map_offsets, prop_map_header) store.ParseCategoriesFromFileData(cat_map_data, cat_map_offsets, cat_map_header) store.ParseIndexesFromFileData(idx_1_map_data, idx_1_map_offsets, idx_1_map_header, store.indexes_1) store.ParseIndexesFromFileData(idx_2_map_data, idx_2_map_offsets, idx_2_map_header, store.indexes_2, has_extra_byte=True) store.ReadPageIndexesAndOtherDefinitions(True) except: log.exception('Failed to find or process one or more dependency files. Cannot proceed!') f.close() return else: log.error('Did not find file dbStr-1.map.data in the same folder as store.db. In order to parse this file' + ' please also have all files that begin with dbStr* in the same location as store.db. It will ' + ' be in the same folder where you found store.db') f.close() return else: store.ReadPageIndexesAndOtherDefinitions() log.info("Creating output file {}".format(output_path_data)) with open(output_path_data, 'wb') as output_file: store.ParseMetadataBlocks(output_file, items, None, None) if create_full_paths_output_file: log.info("Creating output file {}".format(output_path_full_paths)) with open(output_path_full_paths, 'wb') as output_paths_file: output_paths_file.write("Inode_Number\tFull_Path\r\n".encode('utf-8')) for k, v in items.items(): name = v[2] if name: fullpath = RecursiveGetFullPath(v, items) to_write = str(k) + '\t' + fullpath + '\r\n' output_paths_file.write(to_write.encode('utf-8', 'backslashreplace')) except Exception as ex: log.exception('') finally: f.close() time_processing_ended = time.time() run_time = time_processing_ended - time_processing_started log.info("Finished in time = {}".format(time.strftime('%H:%M:%S', time.gmtime(run_time)))) if __name__ == "__main__": import argparse description = "This script will process individual Spotlight database files.\n"\ "These files are found under the volume at location \n "\ "'/.Spotlight-V100/Store-V2/<UUID>' where <UUID> represents a store id.\n"\ "In that folder you should find files named 'store' and '.store' which\n"\ "are the Spotlight databases. Provide these as input to this script. \n\n"\ "iOS Spotlight databases (store.db and .store.db) are found at locations:\n"\ "/private/var/mobile/Library/Spotlight/CoreSpotlight/***/index.spotlightV2\n"\ "where *** is one of NSFileProtectionComplete, NSFileProtectionCompleteUnlessOpen\n"\ "or NSFileProtectionCompleteUntilFirstUserAuthentication folders.\n\n"\ "For iOS databases, you will need to have the files that begin with 'dbStr'\n"\ "in the same folder as store.db. These files will natively be found in the\n"\ "same folder as store.db and are specific to that instance of store.db.\n\n"\ "Example: python.exe spotlight_parser.py c:\store.db c:\store_output\n\n"\ "Send bugs/comments to yogesh@swiftforensics.com " arg_parser = argparse.ArgumentParser(description='Spotlight Parser version {} - {}'.format(__VERSION__, description), formatter_class=argparse.RawTextHelpFormatter) arg_parser.add_argument('input_path', help="Path to 'store' or '.store' file (the Spotlight db)") arg_parser.add_argument('output_folder', help='Path to output folder') arg_parser.add_argument('-p', '--output_prefix', help='Prefix for output file names') args = arg_parser.parse_args() output_folder = args.output_folder output_file_prefix = args.output_prefix if args.output_prefix else 'spotlight-store' # log log_level = logging.DEBUG log_console_handler = logging.StreamHandler() log_console_handler.setLevel(log_level) log_console_format = logging.Formatter('%(levelname)s - %(message)s') log_console_handler.setFormatter(log_console_format) log.addHandler(log_console_handler) log.setLevel(log_level) if not os.path.exists(output_folder): log.info("Output folder '{}' does not exist! Creating it for you.".format(output_folder)) os.makedirs(output_folder) if not os.path.exists(args.input_path): log.error("Input file'{}' does not exist".format(args.input_path)) sys.exit() ProcessStoreDb(args.input_path, output_folder, output_file_prefix)