557 lines
16 KiB
Python
Executable File
557 lines
16 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
# Copyright (c) 2016, Antonio SJ Musumeci <trapexit@spawn.link>
|
|
|
|
# Permission to use, copy, modify, and/or distribute this software for any
|
|
# purpose with or without fee is hereby granted, provided that the above
|
|
# copyright notice and this permission notice appear in all copies.
|
|
|
|
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
|
|
import argparse
|
|
import ctypes
|
|
import errno
|
|
import fnmatch
|
|
import hashlib
|
|
import io
|
|
import os
|
|
import random
|
|
import shlex
|
|
import sys
|
|
|
|
|
|
# Handle to glibc with errno tracking enabled so failed calls can be
# inspected afterwards via ctypes.get_errno().
_libc = ctypes.CDLL("libc.so.6",use_errno=True)
# lgetxattr(2): read an extended attribute of a path without following
# symlinks.  The prototype is declared so ctypes marshals args correctly.
_lgetxattr = _libc.lgetxattr
_lgetxattr.argtypes = [ctypes.c_char_p,ctypes.c_char_p,ctypes.c_void_p,ctypes.c_size_t]
|
|
def lgetxattr(path, name):
    '''
    Return the value of extended attribute `name` on `path` (symlinks
    are not followed), as bytes.

    Returns None when the attribute does not exist (ENODATA); raises
    IOError for any other failure.
    '''
    # libc wants bytes; accept str for caller convenience.
    if isinstance(path, str):
        path = path.encode(errors='backslashreplace')
    if isinstance(name, str):
        name = name.encode(errors='backslashreplace')
    # Start with a small buffer and grow until the value fits.
    length = 64
    while True:
        buf = ctypes.create_string_buffer(length)
        res = _lgetxattr(path, name, buf, ctypes.c_size_t(length))
        if res >= 0:
            return buf.raw[0:res]
        err = ctypes.get_errno()
        if err == errno.ERANGE:
            # Buffer too small: double and retry.
            length *= 2
        elif err == errno.ENODATA:
            # Attribute absent is an expected, non-error outcome.
            return None
        else:
            raise IOError(err, os.strerror(err), path)
|
|
|
|
|
|
def ismergerfs(path):
    '''
    Return True if `path` appears to live on a mergerfs mount, detected
    by probing a mergerfs-specific extended attribute.
    '''
    try:
        # Probe only; the attribute's value is irrelevant here.
        lgetxattr(path, b'user.mergerfs.fullpath')
        return True
    except IOError:
        # Non-mergerfs filesystems fail the xattr lookup.
        return False
|
|
|
|
|
|
def hash_file(filepath, hasher=None, blocksize=65536):
    '''
    Return the hex digest of the full contents of `filepath`.

    `hasher` defaults to a fresh MD5 object; `blocksize` controls the
    read chunk size.
    '''
    digest = hasher if hasher else hashlib.md5()
    with open(filepath, 'rb') as src:
        while True:
            chunk = src.read(blocksize)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
|
|
|
|
|
|
def short_hash_file(filepath, hasher=None, blocksize=65536, blocks=16):
    '''
    Return a digest computed over `blocks` pseudo-randomly chosen chunks
    of the file rather than its full contents.

    The RNG is seeded with the file size so the same file always samples
    the same offsets.  Files no larger than one block are hashed whole.
    '''
    digest = hasher if hasher else hashlib.md5()
    with open(filepath, 'rb') as f:
        size = os.fstat(f.fileno()).st_size
        if size <= blocksize:
            # Small file: a single read from offset 0 covers everything.
            size = 1
            blocks = 1
        random.seed(size, version=2)
        for _ in range(blocks):
            f.seek(random.randrange(size))
            data = f.read(blocksize)
            if not data:
                break
            digest.update(data)
    return digest.hexdigest()
|
|
|
|
|
|
def sizeof_fmt(num):
    '''Render a byte count as a human-readable string, e.g. 1536 -> "1.5KB".'''
    units = ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
    for unit in units:
        if abs(num) < 1024.0:
            return "%3.1f%sB" % (num, unit)
        num /= 1024.0
    # Anything past zettabytes is reported in yottabytes.
    return "%.1f%sB" % (num, 'Y')
|
|
|
|
|
|
def stat_files(paths):
    '''
    os.stat() every path in `paths` and return a list of (path, stat)
    tuples.  Paths that cannot be stat'd (vanished, permission denied,
    broken branch, ...) are silently skipped.
    '''
    rv = []
    for path in paths:
        try:
            rv.append((path, os.stat(path)))
        except OSError:
            # Narrowed from a bare `except:` — only filesystem errors
            # are expected here; anything else should propagate.
            pass
    return rv
|
|
|
|
|
|
def remove(files, execute, verbose):
    '''
    Remove each (path, stat) entry in `files`.

    Prints the equivalent `rm` command when `verbose` is truthy and only
    actually unlinks when `execute` is truthy (dry-run otherwise).
    Errors are reported to stdout rather than raised.
    '''
    for (path, stat) in files:
        try:
            # Fix: `verbose` was accepted but ignored — the command was
            # printed unconditionally, unlike the equivalent loop in dedup().
            if verbose:
                print('rm -vf', shlex.quote(path))
            if execute:
                os.remove(path)
        except Exception as e:
            print("%s" % e)
|
|
|
|
|
|
def print_stats(stats):
    '''Print a numbered summary (path, uid/gid, mode, size, mtime) of stats.'''
    for idx, (path, st) in enumerate(stats, start=1):
        print("# %i: %s" % (idx, path))
        info = ("# - uid: {0:5}; gid: {1:5}; mode: {2:6o}; "
                "size: {3}; mtime: {4}").format(st.st_uid,
                                                st.st_gid,
                                                st.st_mode,
                                                sizeof_fmt(st.st_size),
                                                st.st_mtime)
        print(info)
|
|
|
|
|
|
def total_size(stats):
    '''Return the sum of st_size over a list of (path, stat) tuples.'''
    return sum(st.st_size for (_, st) in stats)
|
|
|
|
|
|
def manual_dedup(fullpath, stats):
    '''
    Interactively ask the user which copy of `fullpath` to keep.

    The chosen entry is removed from `stats` (whatever remains in
    `stats` is what the caller deletes).  Entering 's' clears `stats`
    entirely, i.e. skips the file.  Re-prompts on invalid input.
    '''
    while True:
        value = input("# Which to keep? ('s' to skip):")

        if value.lower() == 's':
            stats.clear()
            return

        try:
            keep = int(value) - 1
            if keep < 0 or keep >= len(stats):
                raise ValueError
            stats.remove(stats[keep])
            return
        except ValueError:
            # Covers both non-integer input and out-of-range numbers.
            # (A duplicate `except NameError` handler was removed: it was
            # a Python 2 input() relic and is unreachable under Python 3.)
            print("Input error: enter a value [1-{0}] or skip by entering 's'".format(len(stats)))
|
|
|
|
|
|
def mtime_all(stats):
    '''True when every entry in `stats` shares the first entry's st_mtime.'''
    reference = stats[0][1].st_mtime
    return all(st.st_mtime == reference for (_, st) in stats)
|
|
|
|
|
|
def mtime_any(mtime, stats):
    '''True when any entry in `stats` has st_mtime equal to `mtime`.'''
    for (_, st) in stats:
        if st.st_mtime == mtime:
            return True
    return False
|
|
|
|
|
|
def size_all(stats):
    '''True when every entry in `stats` shares the first entry's st_size.'''
    reference = stats[0][1].st_size
    return all(st.st_size == reference for (_, st) in stats)
|
|
|
|
|
|
def size_any(size, stats):
    '''True when any entry in `stats` has st_size equal to `size`.'''
    for (_, st) in stats:
        if st.st_size == size:
            return True
    return False
|
|
|
|
|
|
def md5sums_all(stats):
    '''True when all entries have equal sizes AND equal full-file md5sums.'''
    if not size_all(stats):
        # Different sizes can never hash equal; skip the expensive reads.
        return False
    reference = hash_file(stats[0][0])
    return all(hash_file(path) == reference for (path, _) in stats[1:])
|
|
|
|
|
|
def short_md5sums_all(stats):
    '''True when all entries have equal sizes AND equal sampled md5sums.'''
    if not size_all(stats):
        # Different sizes can never hash equal; skip the reads entirely.
        return False
    reference = short_hash_file(stats[0][0])
    return all(short_hash_file(path) == reference for (path, _) in stats[1:])
|
|
|
|
|
|
def oldest_dedup(fullpath, stats):
    '''
    Keep the entry with the smallest mtime by removing it from `stats`.

    When every copy shares both size and mtime there is no meaningful
    "oldest", so fall back to keeping the copy on the branch with the
    most free space.
    '''
    if size_all(stats) and mtime_all(stats):
        drive_with_most_space_dedup(fullpath, stats)
        return

    stats.sort(key=lambda entry: entry[1].st_mtime)
    del stats[0]
|
|
|
|
|
|
def strict_oldest_dedup(fullpath, stats):
    '''
    Keep the uniquely oldest entry.  If any other copy ties its mtime
    the choice is ambiguous, so clear `stats` (skip the file).
    '''
    stats.sort(key=lambda entry: entry[1].st_mtime)

    kept = stats.pop(0)
    if mtime_any(kept[1].st_mtime, stats):
        stats.clear()
|
|
|
|
|
|
def newest_dedup(fullpath, stats):
    '''
    Keep the entry with the largest mtime by removing it from `stats`.

    When every copy shares both size and mtime there is no meaningful
    "newest", so fall back to keeping the copy on the branch with the
    most free space.
    '''
    if size_all(stats) and mtime_all(stats):
        drive_with_most_space_dedup(fullpath, stats)
        return

    stats.sort(key=lambda entry: entry[1].st_mtime, reverse=True)
    del stats[0]
|
|
|
|
|
|
def strict_newest_dedup(fullpath, stats):
    '''
    Keep the uniquely newest entry.  If any other copy ties its mtime
    the choice is ambiguous, so clear `stats` (skip the file).
    '''
    stats.sort(key=lambda entry: entry[1].st_mtime, reverse=True)

    kept = stats.pop(0)
    if mtime_any(kept[1].st_mtime, stats):
        stats.clear()
|
|
|
|
|
|
def largest_dedup(fullpath, stats):
    '''
    Keep the entry with the largest size by removing it from `stats`.

    When every copy shares both size and mtime all are equally "large",
    so fall back to keeping the copy on the branch with the most free
    space.
    '''
    if size_all(stats) and mtime_all(stats):
        drive_with_most_space_dedup(fullpath, stats)
        return

    stats.sort(key=lambda entry: entry[1].st_size, reverse=True)
    del stats[0]
|
|
|
|
|
|
def strict_largest_dedup(fullpath, stats):
    '''
    Keep the uniquely largest entry.  If any other copy ties its size
    the choice is ambiguous, so clear `stats` (skip the file).
    '''
    stats.sort(key=lambda entry: entry[1].st_size, reverse=True)

    kept = stats.pop(0)
    if size_any(kept[1].st_size, stats):
        stats.clear()
|
|
|
|
|
|
def smallest_dedup(fullpath, stats):
    '''
    Keep the entry with the smallest size by removing it from `stats`.

    When every copy shares both size and mtime all are equally "small",
    so fall back to keeping the copy on the branch with the most free
    space.
    '''
    if size_all(stats) and mtime_all(stats):
        drive_with_most_space_dedup(fullpath, stats)
        return

    stats.sort(key=lambda entry: entry[1].st_size)
    del stats[0]
|
|
|
|
|
|
def strict_smallest_dedup(fullpath, stats):
    '''
    Keep the uniquely smallest entry.  If any other copy ties its size
    the choice is ambiguous, so clear `stats` (skip the file).
    '''
    stats.sort(key=lambda entry: entry[1].st_size)

    kept = stats.pop(0)
    if size_any(kept[1].st_size, stats):
        stats.clear()
|
|
|
|
|
|
def calc_space_free(stat):
    '''Free bytes on the filesystem holding the path of a (path, stat) tuple.'''
    vfs = os.statvfs(stat[0])
    return vfs.f_bfree * vfs.f_frsize
|
|
|
|
|
|
def drive_with_most_space_dedup(fullpath, stats):
    '''
    Keep the copy residing on the branch with the most free space by
    removing it from `stats`.
    '''
    stats.sort(key=calc_space_free, reverse=True)
    del stats[0]
|
|
|
|
|
|
def mergerfs_getattr_dedup(origpath, stats):
    '''
    Keep whichever copy mergerfs' getattr policy resolves `origpath` to,
    by removing that entry from `stats`.
    '''
    fullpath = getxattr(origpath, b'user.mergerfs.fullpath')
    for entry in stats:
        if entry[0] == fullpath:
            stats.remove(entry)
            break
|
|
|
|
|
|
def get_dedupfun(name, strict):
    '''
    Map an algorithm name (plus the --strict flag) to its dedup
    function.  Strict variants exist only where ties are meaningful;
    the remaining strict keys alias the non-strict implementation.
    '''
    lookup = {
        'manual':               manual_dedup,
        'strict-manual':        manual_dedup,
        'mostfreespace':        drive_with_most_space_dedup,
        'strict-mostfreespace': drive_with_most_space_dedup,
        'newest':               newest_dedup,
        'strict-newest':        strict_newest_dedup,
        'oldest':               oldest_dedup,
        'strict-oldest':        strict_oldest_dedup,
        'largest':              largest_dedup,
        'strict-largest':       strict_largest_dedup,
        'smallest':             smallest_dedup,
        'strict-smallest':      strict_smallest_dedup,
        'mergerfs':             mergerfs_getattr_dedup,
        'strict-mergerfs':      mergerfs_getattr_dedup
    }
    key = ('strict-' + name) if strict else name
    return lookup[key]
|
|
|
|
|
|
def get_ignorefun(name):
    '''
    Map an --ignore option value to a predicate over (path, stat) lists.
    `None` (option unset) maps to a predicate that never ignores.
    '''
    lookup = {
        None:              lambda stats: None,
        'same-time':       mtime_all,
        'diff-time':       lambda stats: not mtime_all(stats),
        'same-size':       size_all,
        'diff-size':       lambda stats: not size_all(stats),
        'same-hash':       md5sums_all,
        'diff-hash':       lambda stats: not md5sums_all(stats),
        'same-short-hash': short_md5sums_all,
        'diff-short-hash': lambda stats: not short_md5sums_all(stats)
    }

    return lookup[name]
|
|
|
|
|
|
def getxattr(path, key):
    '''
    Fetch extended attribute `key` from `path` decoded as UTF-8.

    Returns '' when the attribute is absent or its value cannot be
    decoded; re-raises any other IOError.
    '''
    attr = None
    try:
        attr = lgetxattr(path, key)
        return attr.decode('utf-8') if attr else ''
    except IOError as e:
        if e.errno != errno.ENODATA:
            raise
        return ''
    except UnicodeDecodeError as e:
        # Report the undecodable value but keep going.
        print(e)
        print(attr)
        return ''
|
|
|
|
|
|
def match(filename, matches):
    '''True if `filename` matches any fnmatch-style pattern in `matches`.'''
    # any() replaces a loop whose variable (`match`) shadowed this
    # function's own name.
    return any(fnmatch.fnmatch(filename, pattern) for pattern in matches)
|
|
|
|
|
|
def dedup(fullpath,verbose,ignorefun,execute,dedupfun):
    '''
    Dedup a single pooled file.

    Returns the number of bytes reclaimed (or that would be reclaimed
    when `execute` is false); returns 0 when the file is ignored,
    skipped, or has only one copy.
    '''
    # All branch copies of the file; mergerfs reports them NUL-separated.
    paths = getxattr(fullpath,b'user.mergerfs.allpaths').split('\0')
    if len(paths) <= 1:
        # A single copy: nothing to dedup.
        return 0

    stats = stat_files(paths)

    if ignorefun(stats):
        if verbose >= 2:
            print('# ignored:',fullpath)
        return 0

    # Manual mode must show the candidates *before* prompting the user.
    if (dedupfun == manual_dedup):
        print('#',fullpath)
        print_stats(stats)

    try:
        # Contract: the dedup function removes the entry to KEEP from
        # `stats`; everything still in `stats` gets deleted below.  An
        # emptied list means "skip this file".
        dedupfun(fullpath,stats)
        if not stats:
            if verbose >= 2:
                print('# skipped:',fullpath)
            return 0

        # Non-manual modes print their header after the decision.
        if (dedupfun != manual_dedup):
            if verbose >= 2:
                print('#',fullpath)
            if verbose >= 3:
                print_stats(stats)

        for (path,stat) in stats:
            try:
                if verbose:
                    print('rm -vf',shlex.quote(path))
                if execute:
                    os.remove(path)
            except Exception as e:
                # Report per-file removal failures but keep going.
                print('#',e)

        return total_size(stats)

    except Exception as e:
        # NOTE(review): broad catch — any failure in the dedup function
        # is reported and the file is counted as 0 bytes saved.
        print(e)

    return 0
|
|
|
|
|
|
def print_help():
    '''Print the hand-written usage text (argparse's add_help is disabled).'''
    help = \
'''
usage: mergerfs.dedup [<options>] <dir>

Remove duplicate files across branches of a mergerfs pool. Provides
multiple algos for determining which file to keep and what to skip.

positional arguments:
  dir                    Starting directory

optional arguments:
  -v, --verbose          Once to print `rm` commands
                         Twice for status info
                         Three for file info
  -i, --ignore=          Ignore files if... (default: none)
                         * same-size : have the same size
                         * diff-size : have different sizes
                         * same-time : have the same mtime
                         * diff-time : have different mtimes
                         * same-hash : have the same md5sum
                         * diff-hash : have different md5sums
                         * same-short-hash : have the same short md5sums
                         * diff-short-hash : have different short md5sums
                         'hash' is expensive. 'short-hash' far less
                         expensive, not as safe, but pretty good.
  -d, --dedup=           What file to *keep* (default: mergerfs)
                         * manual : ask user
                         * oldest : file with smallest mtime
                         * newest : file with largest mtime
                         * largest : file with largest size
                         * smallest : file with smallest size
                         * mostfreespace : file on drive with most free space
                         * mergerfs : file selected by the mergerfs
                           getattr policy
  -s, --strict           Skip dedup if all files have same (mtime,size) value.
                         Only applies to oldest, newest, largest, smallest.
  -e, --execute          Will not perform file removal without this.
  -I, --include=         fnmatch compatible filter to include files.
                         Can be used multiple times.
  -E, --exclude=         fnmatch compatible filter to exclude files.
                         Can be used multiple times.
  -D, --exclude-dir=     Directories to exclude from search.
                         Can be used multiple times.

'''
    print(help)
|
|
|
|
|
|
def buildargparser():
    '''
    Build and return the command-line argument parser.

    add_help is disabled because -h/--help is handled manually via
    print_help().
    '''
    desc = 'dedup files across branches in a mergerfs pool'
    usage = 'mergerfs.dedup [<options>] <dir>'
    # Fix: `desc` and `usage` were built but never handed to argparse,
    # so parse errors showed an auto-generated usage line instead.
    parser = argparse.ArgumentParser(description=desc,
                                     usage=usage,
                                     add_help=False)

    parser.add_argument('dir',
                        type=str,
                        nargs='?',
                        default=None,
                        help='starting directory')
    parser.add_argument('-v','--verbose',
                        action='count',
                        default=0)
    parser.add_argument('-i','--ignore',
                        choices=['same-size','diff-size',
                                 'same-time','diff-time',
                                 'same-hash','diff-hash',
                                 'same-short-hash',
                                 'diff-short-hash'])
    parser.add_argument('-d','--dedup',
                        choices=['manual',
                                 'oldest','newest',
                                 'smallest','largest',
                                 'mostfreespace',
                                 'mergerfs'],
                        default='mergerfs')
    parser.add_argument('-s','--strict',
                        action='store_true')
    parser.add_argument('-e','--execute',
                        action='store_true')
    parser.add_argument('-I','--include',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-E','--exclude',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-D','--exclude-dir',
                        dest='excludedir',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-h','--help',
                        action='store_true')

    return parser
|
|
|
|
|
|
def main():
    '''
    Entry point: parse arguments, validate the target is a mergerfs
    mount, walk the tree, and dedup every file matching the filters.
    Exits the process; never returns normally.
    '''
    # Re-wrap stdio so undecodable filenames never abort the run.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer,
                                  encoding='utf8',
                                  errors='backslashreplace',
                                  line_buffering=True)
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer,
                                  encoding='utf8',
                                  errors='backslashreplace',
                                  line_buffering=True)

    parser = buildargparser()
    args = parser.parse_args()

    if args.help or not args.dir:
        print_help()
        sys.exit(0)

    args.dir = os.path.realpath(args.dir)
    if not ismergerfs(args.dir):
        print("%s is not a mergerfs directory" % args.dir)
        sys.exit(1)

    dedupfun = get_dedupfun(args.dedup,args.strict)
    ignorefun = get_ignorefun(args.ignore)
    verbose = args.verbose
    execute = args.execute
    includes = ['*'] if not args.include else args.include
    excludes = args.exclude

    # Renamed from `total_size`, which shadowed the module-level
    # total_size() helper used by dedup().
    total_savings = 0
    try:
        for (dirname,dirnames,filenames) in os.walk(args.dir,topdown=True):
            # Prune excluded directories in place so os.walk skips them.
            dirnames[:] = [d for d in dirnames if d not in args.excludedir]
            for filename in filenames:
                if match(filename,excludes):
                    continue
                if not match(filename,includes):
                    continue
                fullpath = os.path.join(dirname,filename)
                total_savings += dedup(fullpath,verbose,ignorefun,execute,dedupfun)
    except KeyboardInterrupt:
        print("# exiting: CTRL-C pressed")
    except IOError as e:
        # A closed pipe (e.g. piping into `head`) is not an error.
        if e.errno != errno.EPIPE:
            raise

    print('# Total savings:',sizeof_fmt(total_savings))

    sys.exit(0)
|
|
|
|
|
|
# Script entry point; keeps the module importable without side effects.
if __name__ == "__main__":
    main()
|