# -*- coding: utf-8 -*- from __future__ import absolute_import import os from collections import defaultdict from contextlib import closing from filecmp import cmp as filecmp from math import ceil from multiprocessing.pool import ThreadPool from os.path import abspath from stat import S_IFMT, S_ISLNK import xxhash from .structs import Cache, DupInfo, FileInfo, FilterType, SkipException from .utils.fs import (blksize, checksum, fsdecode, is_archived, is_hidden, is_os64, is_system, remove, sidesum, signature, splitpaths, walk) _xxhash_xxh = xxhash.xxh64 if is_os64 else xxhash.xxh32 _LINKSIZE = 900 if os.name == 'nt' else 60 #: bytes _LITTLESIZE = 100 << 10 #: bytes _BIGSIZE = 100 << 20 #: bytes _SIZERATE = 10 #: percentage _BLKSIZE = 4 << 10 _XXHSIZE = _xxhash_xxh().block_size << 11 CACHE = Cache() def _iterdups(dupinfo): for key, value in dupinfo.dups.items(): if isinstance(value, DupInfo): for subobj, subkey, subvalue in _iterdups(value): yield subobj, subkey, subvalue else: yield dupinfo, key, value def _bufsize(fileinfo): # NOTE: `stat.st_dev` is always zero in Python 2 under Windows. :( if fileinfo.dev: try: blocksize = CACHE.get(fileinfo).blksize except Exception: blocksize = 1 else: blocksize = blksize(fileinfo.path) maxsize = max(blocksize, _XXHSIZE) minsize = min(blocksize, _XXHSIZE) return maxsize - maxsize % minsize def _checksum(fileinfo): try: if S_ISLNK(fileinfo.mode): link = os.readlink(fileinfo.path) hashsum = _xxhash_xxh(link).hexdigest() else: raise AttributeError except AttributeError: bufsize = _bufsize(fileinfo) hashsum = checksum(fileinfo.path, bufsize) return hashsum def _chksize(fileinfo): rate = _SIZERATE blocksize = _BLKSIZE percsize = int(ceil(fileinfo.size / 100.0 * rate)) if blocksize < percsize: percsize -= percsize % blocksize return percsize // 2 def _sidesum(fileinfo): chksize = _chksize(fileinfo) bufsize = _bufsize(fileinfo) hashsums = sidesum(fileinfo.path, chksize, bufsize) return hashsums def _signature(fileinfo): return signature(fileinfo.path) def _filter(func, filelist, dupdict, errlist, onerror): for fileinfo in filelist: try: idkey = func(fileinfo) except SkipException: pass except Exception as exc: if onerror is not None: onerror(exc, fileinfo.path) errlist.append(fileinfo) else: dupdict[idkey].append(fileinfo) return dupdict, errlist def _rulefilter(fltrtype, dupinfo, check, rule, onerror, progress): dups_it = _iterdups(dupinfo) for dupobj, dupkey, filelist in dups_it: try: check(filelist) except SkipException: continue dupdict, errlist = _filter(rule, filelist, defaultdict(list), [], onerror) DupInfo(fltrtype, dupdict, errlist, dupobj, dupkey) if progress is not None: progress(len(filelist)) def _binarycmp(filelist, onerror): file0, file1 = filelist try: if filecmp(file0.path, file1.path, shallow=False): dupdict = {True: filelist} else: dupdict = {} errlist = [] except (IOError, OSError) as exc: if onerror is not None: onerror(exc, abspath(exc.filename)) dupdict = {} errlist = filelist return dupdict, errlist def _binaryfilter(fltrtype, dupinfo, onerror, progress): dups_it = _iterdups(dupinfo) for dupobj, dupkey, filelist in dups_it: try: file0, _ = filelist except ValueError: continue # NOTE: This check can return true one time only; should be optimized? if not file0.size: continue dupdict, errlist = _binarycmp(filelist, onerror) DupInfo(fltrtype, dupdict, errlist, dupobj, dupkey) if progress is not None: progress(2) def _typefilter(fltrtype, dupinfo, onerror, progress): dups_it = _iterdups(dupinfo) for dupobj, dupkey, filelist in dups_it: dupdict, errlist = _filter(lambda f: f[fltrtype], filelist, defaultdict(list), [], onerror) DupInfo(fltrtype, dupdict, errlist, dupobj, dupkey) if progress is not None: progress(len(filelist)) def _signcheck(filelist): # if len(filelist) < 2: # raise SkipException file0 = filelist[0] if not file0.size or _LINKSIZE < file0.size < _LITTLESIZE: raise SkipException elif S_ISLNK(file0.mode): raise SkipException def _sidecheck(filelist): # if len(filelist) < 2: # raise SkipException file0 = filelist[0] if file0.size < _BIGSIZE: raise SkipException elif S_ISLNK(file0.mode): raise SkipException def _hashcheck(filelist): if len(filelist) < 3: raise SkipException elif not filelist[0].size: raise SkipException def _sizecheck(size, minsize, maxsize, scanempties): if not size and not scanempties: raise SkipException elif not minsize <= size <= maxsize: raise SkipException def _rulecheck(path, included_match, excluded_match): if excluded_match(path): raise SkipException elif not included_match(path): raise SkipException def _attrcheck(path, scansystem, scanarchived, scanhidden): if not scanhidden and is_hidden(path): raise SkipException elif not scanarchived and is_archived(path): raise SkipException elif not scansystem and is_system(path): raise SkipException def _filecheck(fileinfo, minsize, maxsize, included_match, excluded_match, scanempties, scansystem, scanarchived, scanhidden): _sizecheck(fileinfo.size, minsize, maxsize, scanempties) _rulecheck(fileinfo.path, included_match, excluded_match) _attrcheck(fileinfo.path, scansystem, scanarchived, scanhidden) return S_IFMT(fileinfo.mode), fileinfo.size def _splitpaths(paths, followlinks): with closing(ThreadPool()) as pool: upaths = pool.imap(fsdecode, paths) return splitpaths(set(upaths), followlinks) def _names_to_info(names, onerror): filelist = [] errlist = [] for filename in names: try: fileinfo = FileInfo(filename) except (IOError, OSError) as exc: filepath = abspath(filename) if onerror is not None: onerror(exc, filepath) errlist.append(filepath) else: filelist.append(fileinfo) return filelist, errlist def _entries_to_info(entries, onerror): filelist = [] errlist = [] for entry in entries: try: st = entry.stat(follow_symlinks=False) fileinfo = FileInfo(entry.name, entry.path, st) except (IOError, OSError) as exc: if onerror is not None: onerror(exc, entry.path) errlist.append(entry.path) else: filelist.append(fileinfo) return filelist, errlist def _filescan(filenames, dupdict, errlist, scnerrlist, scnargs, onerror, progress): filelist, _scnerrlist = _names_to_info(filenames, onerror) scnerrlist.extend(_scnerrlist) def rule(fileinfo): return _filecheck(fileinfo, *scnargs) _filter(rule, filelist, dupdict, errlist, onerror) if progress is not None: progress(len(filelist)) return dupdict, errlist, scnerrlist def _dirscan(dirnames, dupdict, errlist, scnerrlist, scnargs, onerror, followlinks, scanlinks, progress): if onerror is None: def callback(exc): scnerrlist.append(abspath(exc.filename)) else: def callback(exc): filepath = abspath(exc.filename) onerror(exc, filepath) scnerrlist.append(filepath) def rule(fileinfo): return _filecheck(fileinfo, *scnargs) seen = set() for dirname in dirnames: walk_it = walk(dirname, callback, followlinks, seen) for _, files, links in walk_it: if scanlinks: files += links filelist, _scnerrlist = _entries_to_info(files, onerror) scnerrlist.extend(_scnerrlist) _filter(rule, filelist, dupdict, errlist, onerror) if progress is not None: progress(len(filelist)) return dupdict, errlist, scnerrlist def filterdups(fltrtype, dupinfo, onerror, progress): # progress(0) if fltrtype is FilterType.SIGNATURE: _rulefilter(fltrtype, dupinfo, _signcheck, _signature, onerror, progress) elif fltrtype is FilterType.RULE: # NOTE: Just a one-pass check for now... _rulefilter(fltrtype, dupinfo, _sidecheck, _sidesum, onerror, progress) elif fltrtype is FilterType.HASH: _rulefilter(fltrtype, dupinfo, _hashcheck, _checksum, onerror, progress) elif fltrtype is FilterType.BINARY: _binaryfilter(fltrtype, dupinfo, onerror, progress) else: _typefilter(fltrtype, dupinfo, onerror, progress) return dupinfo def _filepurge(filepath, duplist, errlist, trash, onerror): try: remove(filepath, trash) except Exception as exc: if onerror is not None: onerror(exc, filepath) errlist.append(filepath) else: duplist.append(filepath) def _purge(filelist, duplist, errlist, trash, ondel, onerror): if ondel: for fileinfo in filelist: filepath = fileinfo.path try: ondel(filepath) except SkipException: continue _filepurge(filepath, duplist, errlist, trash, onerror) else: for fileinfo in filelist: _filepurge(fileinfo.path, duplist, errlist, trash, onerror) return duplist, errlist def purgedups(dupinfo, trash, ondel, onerror, progress): # progress(0) delduplist = [] delerrlist = [] # NOTE: Keep the oldest of firsts def sort_fn(obj): return obj.index, -obj.mtime, obj.path dups_it = _iterdups(dupinfo) for _, _, filelist in dups_it: duplist = sorted(filelist, key=sort_fn)[1:] _purge(duplist, delduplist, delerrlist, trash, ondel, onerror) if progress is not None: progress(len(filelist)) return delduplist, delerrlist def scandups(paths, sizes, matchers, recursive, followlinks, scanlinks, flags, onerror, progress): # progress(0) dupdict = defaultdict(list) errlist = [] scnerrlist = [] scnargs = sizes + matchers + flags splitted_paths = _splitpaths(paths, followlinks) dirnames, filenames, linknames, _, errnames = splitted_paths scnerrlist.extend(errnames) if scanlinks: filenames += linknames _filescan(filenames, dupdict, errlist, scnerrlist, scnargs, onerror, progress) if recursive: _dirscan(dirnames, dupdict, errlist, scnerrlist, scnargs, onerror, followlinks, scanlinks, progress) dupinfo = DupInfo(FilterType.ID, dupdict, errlist) return dupinfo, scnerrlist