#!/usr/local/bin/python3

import os
import sys
import argparse
import hashlib

import humanfriendly

parser = argparse.ArgumentParser(description='What the dupe!?')

# Pop the default optional group so the required group is listed first in --help.
optional = parser._action_groups.pop()
required = parser.add_argument_group('required arguments')
exclusive = parser.add_mutually_exclusive_group(required=False)

exclusive.add_argument('--threshold', type=str,
                       help='Only output files greater than \'size\', e.g. 100M')

exclusive.add_argument('--sizes', type=str, nargs='+',
                       help='Size buckets to group the output by, e.g. 10M 100M 1G')

required.add_argument('--dir', type=str, nargs='?', required=True, action='append',
                      help='Directory to scan. Can be issued multiple times.')

optional.add_argument('--exclude', type=str, nargs='?', action='append',
                      help='Directory to exclude from the scan. Can be issued multiple times.')

parser._action_groups.append(optional)

args = parser.parse_args()

# Size buckets used to group the report; the last entry is an open-ended bucket.
if args.sizes:
    sizes = args.sizes
else:
    sizes = ['10M', '50M', '100M', '1G', '5G']
sizes.append('larger than ' + sizes[-1])

# Minimum file size to consider, in bytes (0 means no minimum).
if args.threshold:
    threshold = humanfriendly.parse_size(args.threshold)
else:
    threshold = 0


def findDup(parentFolder):
    # Duplicates in the format {hash: [paths]}
    dups = {}
    print()
    for dirName, subdirs, fileList in os.walk(parentFolder):
        # Remove excluded directories from the walk.
        if args.exclude:
            for excluded in args.exclude:
                subdirs[:] = [dn for dn in subdirs if dirName + '/' + dn != excluded]
        print(' Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file.
            path = os.path.join(dirName, filename)
            if os.path.exists(path):
                # Calculate the hash; None means the file was skipped or unreadable.
                file_hash = hashfile(path)
                if file_hash is None:
                    continue
                # Add or append the file path.
                if file_hash in dups:
                    dups[file_hash].append(path)
                else:
                    dups[file_hash] = [path]
    return dups


# Merges dict2 into dict1, extending the path list of any hash present in both.
def joinDicts(dict1, dict2):
    for key, paths in dict2.items():
        if key in dict1:
            dict1[key].extend(paths)
        else:
            dict1[key] = paths


def hashfile(path, blocksize=65536):
    file_size = os.path.getsize(path)
    # Only hash files larger than the threshold (if set).
    if threshold == 0 or file_size > threshold:
        try:
            hasher = hashlib.md5()
            with open(path, 'rb') as afile:
                buf = afile.read(blocksize)
                while len(buf) > 0:
                    hasher.update(buf)
                    buf = afile.read(blocksize)
            return hasher.hexdigest()
        except OSError:
            return None
    return None


def printResults(dict1):
    final = {}
    for size in sizes:
        final[size] = []
    if threshold > 0:
        final[threshold] = []

    # Only hashes shared by at least two paths are duplicates.
    results = list(filter(lambda x: len(x) > 1, dict1.values()))

    for result in results:
        file_size = os.path.getsize(result[0])
        # --threshold and --sizes are mutually exclusive: bucket the groups by size
        # unless a threshold was given, in which case a single bucket is used.
        if not args.threshold:
            count = 0
            while count + 1 < len(sizes):
                try:
                    if humanfriendly.parse_size(sizes[count]) <= file_size < humanfriendly.parse_size(sizes[count + 1]):
                        final[sizes[count + 1]].append(result)
                except humanfriendly.InvalidSize:
                    # The 'larger than ...' sentinel is not parseable: open-ended bucket.
                    final[sizes[-1]].append(result)
                count += 1
            if file_size < humanfriendly.parse_size(sizes[0]):
                final[sizes[0]].append(result)
        else:
            if file_size >= threshold:
                final[threshold].append(result)

    status = [x for x in final if len(final[x]) > 0]
    if len(results) > 0 and len(status) > 0:
        print('___________________')
        print('\n\033[1;34m\u25b6 Duplicates Found\033[0m\n')
        print(' The following files are identical. The name could differ, but the content is identical')
        print('___________________')
        new = ['0']
        if not args.threshold:
            for size in sizes:
                new.append(size)
                if len(final[size]) > 0:
                    if size == 'larger than ' + sizes[-2]:
                        print("\n\033[1;34m\u25b6 >= %s\033[0m" % (new[-2]))
                    else:
                        print("\n\033[1;34m\u25b6 %s to %s\033[0m" % (new[-2], size))
                    for dupe in final[size]:
                        print('___________________\n')
                        for file in dupe:
                            print(' %s' % str(file))
                    print('___________________')
        else:
            print("\n\033[1;34m\u25b6 Files bigger than %s\033[0m"
                  % humanfriendly.format_size(threshold, binary=True))
            for dupe in final[threshold]:
                print('___________________\n')
                for file in dupe:
                    print(' %s' % str(file))
            print('___________________')
    else:
        print('\n\033[1mNo duplicate files found.\033[0m')


if __name__ == '__main__':
    dups = {}
    folders = args.dir
    # Iterate over the given folders.
    for i in folders:
        if os.path.exists(i):
            # Find the duplicated files and merge them into dups.
            joinDicts(dups, findDup(i))
        else:
            print('%s is not a valid path, please verify' % i)
            sys.exit(1)
    printResults(dups)
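
# Example invocations (a minimal sketch; the file name 'whatthedupe.py' is an
# assumption, not part of the script itself):
#
#   python3 whatthedupe.py --dir /data --dir /backups --exclude /data/tmp --sizes 100M 1G
#   python3 whatthedupe.py --dir /data --threshold 500M
#
# --threshold and --sizes are mutually exclusive; --dir and --exclude may each
# be issued multiple times.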