mergerfs-tools/src/mergerfs.dedup

#!/usr/bin/env python3

# Copyright (c) 2016, Antonio SJ Musumeci <trapexit@spawn.link>

# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.

# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

import argparse
import ctypes
import errno
import fnmatch
import hashlib
import io
import os
import random
import shlex
import sys


# Direct binding to libc's lgetxattr(2); use_errno=True lets the wrapper
# below read the C errno through ctypes.get_errno().
_libc = ctypes.CDLL("libc.so.6", use_errno=True)
_lgetxattr = _libc.lgetxattr
_lgetxattr.argtypes = [ctypes.c_char_p, ctypes.c_char_p,
                       ctypes.c_void_p, ctypes.c_size_t]
# lgetxattr(2) returns ssize_t; without this ctypes assumes a C int.
_lgetxattr.restype = ctypes.c_ssize_t


def lgetxattr(path, name):
    """Read extended attribute *name* of *path* without following symlinks.

    Args:
        path: File path as ``str`` or ``bytes``.  ``str`` paths are encoded
            with ``os.fsencode`` so that surrogate-escaped names (as
            produced by ``os.walk`` for non-UTF-8 filenames) map back to
            the original bytes on disk.
        name: Attribute name as ``str`` or ``bytes``.

    Returns:
        The raw attribute value as ``bytes``, or ``None`` when the
        attribute does not exist (``ENODATA``).

    Raises:
        IOError: For every other errno reported by the C call; the
            encoded path is attached as the exception's filename.
    """
    if isinstance(path, str):
        path = os.fsencode(path)
    if isinstance(name, str):
        name = name.encode(errors='backslashreplace')

    length = 64
    while True:
        buf = ctypes.create_string_buffer(length)
        res = _lgetxattr(path, name, buf, ctypes.c_size_t(length))
        if res >= 0:
            return buf.raw[:res]

        err = ctypes.get_errno()
        if err == errno.ERANGE:
            # Buffer too small for the value: retry with twice the room.
            length *= 2
        elif err == errno.ENODATA:
            return None
        else:
            raise IOError(err, os.strerror(err), path)


def ismergerfs(path):
    """Return True when *path* carries the ``user.mergerfs.fullpath`` xattr.

    ``lgetxattr`` reports a missing attribute by returning ``None`` rather
    than raising, so the result has to be checked explicitly; otherwise
    any path on an xattr-capable filesystem would be accepted.  Any
    ``IOError`` from the lookup is treated as "not mergerfs".
    """
    try:
        return lgetxattr(path, b'user.mergerfs.fullpath') is not None
    except IOError:
        return False


def hash_file(filepath, hasher=None, blocksize=65536):
    """Return the hex digest of the whole file at *filepath*.

    Args:
        filepath: Path of the file to read (opened in binary mode).
        hasher: Optional hashlib-style object; a fresh MD5 is used when
            omitted.
        blocksize: Number of bytes requested per read.
    """
    digest = hasher if hasher else hashlib.md5()

    with open(filepath, 'rb') as stream:
        # iter() with a sentinel keeps reading until read() returns b''.
        for chunk in iter(lambda: stream.read(blocksize), b''):
            digest.update(chunk)

    return digest.hexdigest()


def short_hash_file(filepath, hasher=None, blocksize=65536, blocks=16):
    """Return a cheap digest built from pseudo-randomly sampled blocks.

    The sample offsets come from a generator seeded with the file size, so
    two files of equal size are always sampled at the same positions.
    Files no larger than *blocksize* are read once from offset 0.

    A private ``random.Random`` instance is used so the process-wide
    ``random`` state is left untouched; it yields the same offsets as
    seeding the module-level generator with the same value.

    Args:
        filepath: Path of the file to sample (opened in binary mode).
        hasher: Optional hashlib-style object; a fresh MD5 is used when
            omitted.
        blocksize: Number of bytes read at each sampled offset.
        blocks: Number of offsets to sample.
    """
    if not hasher:
        hasher = hashlib.md5()

    with open(filepath, 'rb') as f:
        size = os.fstat(f.fileno()).st_size
        if size <= blocksize:
            # Small file: a single read from offset 0 covers everything.
            size = 1
            blocks = 1

        rng = random.Random(size)
        for _ in range(blocks):
            f.seek(rng.randrange(size))
            buf = f.read(blocksize)
            if not buf:
                break
            hasher.update(buf)

    return hasher.hexdigest()


def sizeof_fmt(num):
    """Format a byte count as a short string, scaling in steps of 1024.

    Returns e.g. ``'1.5KB'``; values beyond the 'Z' step are reported
    with the 'Y' prefix.
    """
    prefixes = ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z')
    value = num
    for prefix in prefixes:
        if -1024.0 < value < 1024.0:
            return "%3.1f%sB" % (value, prefix)
        value = value / 1024.0
    return "%.1f%sB" % (value, 'Y')


def stat_files(paths):
    """Return ``(path, os.stat_result)`` pairs for every path that can be stat'ed.

    Paths that fail ``os.stat`` (missing file, permission problem, empty
    or otherwise invalid path string) are skipped on purpose.  Only those
    expected failures are swallowed, so ``KeyboardInterrupt`` and
    programming errors still propagate.
    """
    rv = []
    for path in paths:
        try:
            st = os.stat(path)
        except (OSError, ValueError):
            # Best effort: an unreadable branch copy is simply left out.
            continue
        rv.append((path, st))

    return rv


def remove(files,execute,verbose):
    """Print an ``rm`` command for each entry and optionally delete it.

    Args:
        files: Iterable of ``(path, stat)`` pairs; only the path is used.
        execute: When false the command is printed but nothing is deleted.
        verbose: Accepted but not used by this function.

    Any exception raised while handling one entry is printed and the loop
    moves on to the next entry.
    """
    for (target,_) in files:
        try:
            print('rm -vf',shlex.quote(target))
            if not execute:
                continue
            os.remove(target)
        except Exception as err:
            print("%s" % err)


def print_stats(stats):
    """Print a numbered, commented two-line summary for each entry.

    *stats* is a sequence of ``(path, stat)`` pairs.  The first line shows
    the 1-based position and the path, the second the owner, group, mode
    (octal), human-readable size and mtime.
    """
    template = ("#   - uid: {0:5}; gid: {1:5}; mode: {2:6o}; "
                "size: {3}; mtime: {4}")
    for number, (path, st) in enumerate(stats, start=1):
        print("#  %i: %s" % (number, path))
        details = template.format(st.st_uid,
                                  st.st_gid,
                                  st.st_mode,
                                  sizeof_fmt(st.st_size),
                                  st.st_mtime)
        print(details)


def total_size(stats):
    """Return the sum of ``st_size`` over all ``(name, stat)`` pairs."""
    return sum(entry_stat.st_size for (_, entry_stat) in stats)


def manual_dedup(fullpath, stats):
    """Ask the user which copy to keep and drop it from *stats*.

    The caller deletes whatever is left in *stats*, so the chosen entry is
    removed from the list; answering ``s`` empties the list, which the
    caller treats as "skip this file".  Invalid answers (non-numeric or
    out of range) print a hint and prompt again.

    Args:
        fullpath: Path inside the pool; not used by this function.
        stats: List of ``(path, stat)`` pairs, modified in place.
    """
    while True:
        # Tolerate stray whitespace around the answer.
        value = input("# Which to keep? ('s' to skip):").strip()

        if value.lower() == 's':
            stats.clear()
            return

        try:
            index = int(value) - 1
            if not 0 <= index < len(stats):
                raise ValueError
        except ValueError:
            print("Input error: enter a value [1-{0}] or skip by entering 's'".format(len(stats)))
            continue

        del stats[index]
        return


def mtime_all(stats):
    """Return True when every ``(path, stat)`` entry has the same mtime.

    An empty list is vacuously "all equal" instead of raising IndexError.
    """
    if not stats:
        return True
    mtime = stats[0][1].st_mtime
    return all(st.st_mtime == mtime for (_, st) in stats)


def mtime_any(mtime, stats):
    """Return True when at least one ``(path, stat)`` entry has *mtime*.

    A generator is used so the scan stops at the first match instead of
    first building a complete list of comparison results.
    """
    return any(st.st_mtime == mtime for (_, st) in stats)


def size_all(stats):
    """Return True when every ``(path, stat)`` entry has the same size.

    An empty list is vacuously "all equal" instead of raising IndexError.
    """
    if not stats:
        return True
    size = stats[0][1].st_size
    return all(st.st_size == size for (_, st) in stats)


def size_any(size, stats):
    """Return True when at least one ``(path, stat)`` entry has *size*.

    A generator is used so the scan stops at the first match instead of
    first building a complete list of comparison results.
    """
    return any(st.st_size == size for (_, st) in stats)


def md5sums_all(stats):
    """Return True when all entries have equal size and equal full-file hash.

    The size comparison runs first so differently sized files are never
    hashed; hashing stops at the first mismatch.
    """
    if not size_all(stats):
        return False

    reference = hash_file(stats[0][0])
    for (path, _) in stats[1:]:
        if hash_file(path) != reference:
            return False
    return True


def short_md5sums_all(stats):
    """Return True when all entries have equal size and equal sampled hash.

    Same as the full-hash comparison but based on ``short_hash_file``;
    hashing stops at the first mismatch.
    """
    if not size_all(stats):
        return False

    reference = short_hash_file(stats[0][0])
    for (path, _) in stats[1:]:
        if short_hash_file(path) != reference:
            return False
    return True


def oldest_dedup(fullpath,stats):
    """Keep the copy with the smallest mtime.

    The kept entry is taken out of *stats*; what remains is what the
    caller deletes.  When every copy has the same size and mtime the
    choice is delegated to ``drive_with_most_space_dedup``.
    """
    indistinguishable = size_all(stats) and mtime_all(stats)
    if indistinguishable:
        drive_with_most_space_dedup(fullpath,stats)
        return

    def by_mtime(entry):
        return entry[1].st_mtime

    # Ascending sort puts the oldest copy first; drop it from the list.
    stats.sort(key=by_mtime)
    del stats[0]


def strict_oldest_dedup(fullpath,stats):
    """Keep the copy with the smallest mtime, but only if it is unique.

    The kept entry is taken out of *stats*.  If another copy shares the
    same mtime the list is emptied, which the caller treats as "skip".
    """
    def by_mtime(entry):
        return entry[1].st_mtime

    stats.sort(key=by_mtime)

    (_, kept_stat) = stats[0]
    del stats[0]
    if mtime_any(kept_stat.st_mtime,stats):
        # Tie on mtime: no clear winner, so nothing gets removed.
        stats.clear()


def newest_dedup(fullpath,stats):
    """Keep the copy with the largest mtime.

    The kept entry is taken out of *stats*; what remains is what the
    caller deletes.  When every copy has the same size and mtime the
    choice is delegated to ``drive_with_most_space_dedup``.
    """
    indistinguishable = size_all(stats) and mtime_all(stats)
    if indistinguishable:
        drive_with_most_space_dedup(fullpath,stats)
        return

    def by_mtime(entry):
        return entry[1].st_mtime

    # Descending sort puts the newest copy first; drop it from the list.
    stats.sort(key=by_mtime,reverse=True)
    del stats[0]


def strict_newest_dedup(fullpath,stats):
    """Keep the copy with the largest mtime, but only if it is unique.

    The kept entry is taken out of *stats*.  If another copy shares the
    same mtime the list is emptied, which the caller treats as "skip".
    """
    def by_mtime(entry):
        return entry[1].st_mtime

    stats.sort(key=by_mtime,reverse=True)

    (_, kept_stat) = stats[0]
    del stats[0]
    if mtime_any(kept_stat.st_mtime,stats):
        # Tie on mtime: no clear winner, so nothing gets removed.
        stats.clear()


def largest_dedup(fullpath,stats):
    """Keep the copy with the largest size.

    The kept entry is taken out of *stats*; what remains is what the
    caller deletes.  When every copy has the same size and mtime the
    choice is delegated to ``drive_with_most_space_dedup``.
    """
    indistinguishable = size_all(stats) and mtime_all(stats)
    if indistinguishable:
        drive_with_most_space_dedup(fullpath,stats)
        return

    def by_size(entry):
        return entry[1].st_size

    # Descending sort puts the largest copy first; drop it from the list.
    stats.sort(key=by_size,reverse=True)
    del stats[0]


def strict_largest_dedup(fullpath,stats):
    """Keep the copy with the largest size, but only if it is unique.

    The kept entry is taken out of *stats*.  If another copy has the same
    size the list is emptied, which the caller treats as "skip".
    """
    def by_size(entry):
        return entry[1].st_size

    stats.sort(key=by_size,reverse=True)

    (_, kept_stat) = stats[0]
    del stats[0]
    if size_any(kept_stat.st_size,stats):
        # Tie on size: no clear winner, so nothing gets removed.
        stats.clear()


def smallest_dedup(fullpath,stats):
    """Keep the copy with the smallest size.

    The kept entry is taken out of *stats*; what remains is what the
    caller deletes.  When every copy has the same size and mtime the
    choice is delegated to ``drive_with_most_space_dedup``.
    """
    indistinguishable = size_all(stats) and mtime_all(stats)
    if indistinguishable:
        drive_with_most_space_dedup(fullpath,stats)
        return

    def by_size(entry):
        return entry[1].st_size

    # Ascending sort puts the smallest copy first; drop it from the list.
    stats.sort(key=by_size)
    del stats[0]


def strict_smallest_dedup(fullpath,stats):
    """Keep the copy with the smallest size, but only if it is unique.

    The kept entry is taken out of *stats*.  If another copy has the same
    size the list is emptied, which the caller treats as "skip".
    """
    def by_size(entry):
        return entry[1].st_size

    stats.sort(key=by_size)

    (_, kept_stat) = stats[0]
    del stats[0]
    if size_any(kept_stat.st_size,stats):
        # Tie on size: no clear winner, so nothing gets removed.
        stats.clear()


def calc_space_free(stat):
    """Return the free bytes on the filesystem holding ``stat[0]``.

    *stat* is a ``(path, stat)`` pair; the value is the fragment size
    multiplied by the free block count reported by ``os.statvfs``.
    """
    path = stat[0]
    vfs = os.statvfs(path)
    return vfs.f_frsize * vfs.f_bfree


def drive_with_most_space_dedup(fullpath,stats):
    """Keep the copy that lives on the filesystem with the most free space.

    Entries are ordered by ``calc_space_free`` (largest first) and the
    first one is taken out of *stats*; the rest is what the caller deletes.
    """
    stats.sort(key=calc_space_free,reverse=True)
    roomiest = stats[0]
    stats.remove(roomiest)


def mergerfs_getattr_dedup(origpath, stats):
    """Keep the copy that mergerfs itself reports for *origpath*.

    The path stored in the ``user.mergerfs.fullpath`` xattr is looked up
    in *stats* and its entry is taken out of the list; the rest is what
    the caller deletes.

    If the reported path is not among *stats* (empty xattr value, or the
    file could not be stat'ed) there is no copy known to be kept, so the
    list is emptied and the caller skips the file rather than deleting
    every copy.
    """
    fullpath = getxattr(origpath, b'user.mergerfs.fullpath')
    for index, (path, _) in enumerate(stats):
        if path == fullpath:
            del stats[index]
            return

    # No entry matched: removing anything would risk deleting all copies.
    stats.clear()


def get_dedupfun(name,strict):
    """Return the dedup strategy function for *name*.

    With *strict* set, the ``strict-`` variant is looked up.  Strategies
    without a distinct strict form map both keys to the same function.
    Raises KeyError for an unknown name.
    """
    key = 'strict-' + name if strict else name

    dispatch = {
        # strategies with a dedicated strict implementation
        'newest': newest_dedup,
        'oldest': oldest_dedup,
        'largest': largest_dedup,
        'smallest': smallest_dedup,
        'strict-newest': strict_newest_dedup,
        'strict-oldest': strict_oldest_dedup,
        'strict-largest': strict_largest_dedup,
        'strict-smallest': strict_smallest_dedup,
        # strategies where strict makes no difference
        'manual': manual_dedup,
        'mostfreespace': drive_with_most_space_dedup,
        'mergerfs': mergerfs_getattr_dedup,
        'strict-manual': manual_dedup,
        'strict-mostfreespace': drive_with_most_space_dedup,
        'strict-mergerfs': mergerfs_getattr_dedup,
    }
    return dispatch[key]


def get_ignorefun(name):
    """Return the predicate deciding whether a set of copies is ignored.

    The predicate takes the list of ``(path, stat)`` pairs and returns a
    truthy value when the file should be left alone.  ``None`` selects a
    predicate that never ignores.  Raises KeyError for an unknown name.
    """
    def never(stats):
        return None

    def negate(predicate):
        def negated(stats):
            return not predicate(stats)
        return negated

    predicates = {
        None: never,
        'same-time': mtime_all,
        'diff-time': negate(mtime_all),
        'same-size': size_all,
        'diff-size': negate(size_all),
        'same-hash': md5sums_all,
        'diff-hash': negate(md5sums_all),
        'same-short-hash': short_md5sums_all,
        'diff-short-hash': negate(short_md5sums_all),
    }

    return predicates[name]


def getxattr(path, key):
    """Return extended attribute *key* of *path* as text, or ``''``.

    The raw value is decoded with ``os.fsdecode``, so values containing
    paths with bytes that are not valid in the filesystem encoding come
    back surrogate-escaped, the same form ``os.walk`` and ``os.stat``
    use, instead of being dropped.

    Returns:
        The decoded value, or ``''`` when the attribute is missing or
        empty.

    Raises:
        IOError: Propagated from ``lgetxattr`` for any errno other than
            ``ENODATA``.
    """
    try:
        attr = lgetxattr(path, key)
    except IOError as e:
        if e.errno == errno.ENODATA:
            return ''
        raise

    if not attr:
        return ''
    return os.fsdecode(attr)


def match(filename,matches):
    """Return True when *filename* matches any fnmatch pattern in *matches*."""
    return any(fnmatch.fnmatch(filename,pattern) for pattern in matches)


def dedup(fullpath, verbose, ignorefun, execute, dedupfun):
    """Deduplicate one pool file across its branches.

    Args:
        fullpath: Path of the file inside the mergerfs mount.
        verbose: Verbosity level (1: print ``rm`` commands, 2: status
            lines, 3: per-file details).
        ignorefun: Predicate over the ``(path, stat)`` list; a truthy
            result leaves the file untouched.
        execute: Only when true are files actually removed.
        dedupfun: Strategy that takes the copy to keep out of the list
            (or empties the list to request a skip).

    Returns:
        The number of bytes freed: the sizes of the copies that were
        removed, or that would be removed in a dry run.  Copies whose
        removal failed are not counted.
    """
    paths = getxattr(fullpath, b'user.mergerfs.allpaths').split('\0')
    if len(paths) <= 1:
        return 0

    stats = stat_files(paths)
    if len(stats) <= 1:
        # Fewer than two copies could be stat'ed: there is nothing to
        # compare, and acting on a lone copy could delete the only file.
        if verbose >= 2:
            print('# skipped:', fullpath)
        return 0

    if ignorefun(stats):
        if verbose >= 2:
            print('# ignored:', fullpath)
        return 0

    interactive = (dedupfun == manual_dedup)
    if interactive:
        print('#', fullpath)
        print_stats(stats)

    try:
        dedupfun(fullpath, stats)
    except Exception as e:
        # A failing strategy must not abort the whole walk.
        print(e)
        return 0

    if not stats:
        if verbose >= 2:
            print('# skipped:', fullpath)
        return 0

    if not interactive:
        if verbose >= 2:
            print('#', fullpath)
        if verbose >= 3:
            print_stats(stats)

    freed = 0
    for (path, st) in stats:
        if verbose:
            print('rm -vf', shlex.quote(path))
        if execute:
            try:
                os.remove(path)
            except OSError as e:
                print('#', e)
                continue
        freed += st.st_size

    return freed


def print_help():
    """Print the complete usage text of mergerfs.dedup to stdout."""
    usage_text = '''
usage: mergerfs.dedup [<options>] <dir>

Remove duplicate files across branches of a mergerfs pool. Provides
multiple algos for determining which file to keep and what to skip.

positional arguments:
  dir                    Starting directory

optional arguments:
  -v, --verbose          Once to print `rm` commands
                         Twice for status info
                         Three for file info
  -i, --ignore=          Ignore files if... (default: none)
                         * same-size       : have the same size
                         * diff-size       : have different sizes
                         * same-time       : have the same mtime
                         * diff-time       : have different mtimes
                         * same-hash       : have the same md5sum
                         * diff-hash       : have different md5sums
                         * same-short-hash : have the same short md5sums
                         * diff-short-hash : have different short md5sums
                         'hash' is expensive. 'short-hash' far less
                         expensive, not as safe, but pretty good.
  -d, --dedup=           What file to *keep* (default: mergerfs)
                         * manual        : ask user
                         * oldest        : file with smallest mtime
                         * newest        : file with largest mtime
                         * largest       : file with largest size
                         * smallest      : file with smallest size
                         * mostfreespace : file on drive with most free space
                         * mergerfs      : file selected by the mergerfs
                                           getattr policy
  -s, --strict           Skip dedup if all files have same (mtime,size) value.
                         Only applies to oldest, newest, largest, smallest.
  -e, --execute          Will not perform file removal without this.
  -I, --include=         fnmatch compatible filter to include files.
                         Can be used multiple times.
  -E, --exclude=         fnmatch compatible filter to exclude files.
                         Can be used multiple times.
  -D, --exclude-dir=     Directories to exclude from search.
                         Can be used multiple times.

'''
    print(usage_text)


def buildargparser():
    """Build the command-line parser for mergerfs.dedup.

    The built-in argparse help is disabled (``add_help=False``) and
    ``-h/--help`` is declared as a plain flag, so the caller decides how
    to present help.  The description and usage strings are passed to the
    parser so that argparse's own error output shows the same usage line.

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    desc = 'dedup files across branches in a mergerfs pool'
    usage = 'mergerfs.dedup [<options>] <dir>'
    parser = argparse.ArgumentParser(description=desc,
                                     usage=usage,
                                     add_help=False)

    parser.add_argument('dir',
                        type=str,
                        nargs='?',
                        default=None,
                        help='starting directory')
    parser.add_argument('-v', '--verbose',
                        action='count',
                        default=0)
    parser.add_argument('-i', '--ignore',
                        choices=['same-size', 'diff-size',
                                 'same-time', 'diff-time',
                                 'same-hash', 'diff-hash',
                                 'same-short-hash',
                                 'diff-short-hash'])
    parser.add_argument('-d', '--dedup',
                        choices=['manual',
                                 'oldest', 'newest',
                                 'smallest', 'largest',
                                 'mostfreespace',
                                 'mergerfs'],
                        default='mergerfs')
    parser.add_argument('-s', '--strict',
                        action='store_true')
    parser.add_argument('-e', '--execute',
                        action='store_true')
    # The three filters below may be repeated; values accumulate in a list.
    parser.add_argument('-I', '--include',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-E', '--exclude',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-D', '--exclude-dir',
                        dest='excludedir',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-h', '--help',
                        action='store_true')

    return parser


def main():
    """Command-line entry point: walk a mergerfs directory and dedup files.

    Re-wraps stdout and stderr as UTF-8 with ``backslashreplace`` so
    unencodable characters in paths cannot abort printing, parses the
    arguments, verifies that the target is a mergerfs directory, then
    walks it and calls ``dedup`` for every file passing the
    include/exclude filters.  Always ends via ``sys.exit``.
    """
    for stream_name in ('stdout', 'stderr'):
        raw = getattr(sys, stream_name).buffer
        setattr(sys, stream_name,
                io.TextIOWrapper(raw,
                                 encoding='utf8',
                                 errors='backslashreplace',
                                 line_buffering=True))

    args = buildargparser().parse_args()

    if args.help or not args.dir:
        print_help()
        sys.exit(0)

    args.dir = os.path.realpath(args.dir)
    if not ismergerfs(args.dir):
        print("%s is not a mergerfs directory" % args.dir)
        sys.exit(1)

    dedupfun = get_dedupfun(args.dedup, args.strict)
    ignorefun = get_ignorefun(args.ignore)
    includes = args.include or ['*']
    excludes = args.exclude

    saved = 0
    try:
        for (root, subdirs, files) in os.walk(args.dir, topdown=True):
            # Prune in place so os.walk does not descend into excluded dirs.
            subdirs[:] = [d for d in subdirs if d not in args.excludedir]
            for filename in files:
                if match(filename, excludes) or not match(filename, includes):
                    continue
                saved += dedup(os.path.join(root, filename),
                               args.verbose,
                               ignorefun,
                               args.execute,
                               dedupfun)
    except KeyboardInterrupt:
        print("# exiting: CTRL-C pressed")
    except IOError as e:
        # A closed output pipe ends the walk quietly; anything else is fatal.
        if e.errno != errno.EPIPE:
            raise

    print('# Total savings:', sizeof_fmt(saved))

    sys.exit(0)


# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()