#!/usr/local/bin/python3
"""What the dupe!? -- find files with identical content (same MD5 digest)
under one or more directories and print them grouped by size bucket."""
import os
import sys
import hashlib
import argparse

import humanfriendly

parser = argparse.ArgumentParser(description='What the dupe!?')
# Pop the default "optional arguments" group and re-append it after the
# required group, so required arguments are listed first in --help.
optional = parser._action_groups.pop()
required = parser.add_argument_group('required arguments')
optional.add_argument('--size', type=str,
                      help='Only output files greater than \'size\'. 16, 16K, 16M, 16G, 16T')
required.add_argument('--dir', type=str, nargs='?', required=True, action='append',
                      help='Directory to scan. Can be used multiple times.')
parser._action_groups.append(optional)
args = parser.parse_args()

# Upper bounds of the size buckets used when no --size filter is given.
max_size1 = '10M'
max_size2 = '50M'
max_size3 = '100M'
max_size4 = '1G'
max_size5 = '5G'
# BUG FIX: this was '5G', identical to max_size5, which made the "1G-5G" and
# "5G+" buckets share a single dict key and printed that bucket twice.
# '5G+' is only ever used as a dict key / label, never parsed as a size.
max_size6 = '5G+'
sizes = [max_size1, max_size2, max_size3, max_size4, max_size5, max_size6]

# Minimum file size (in bytes) to report; 0 means "report everything, bucketed".
# Renamed from `bytes`, which shadowed the builtin.
if args.size:
    min_bytes = humanfriendly.parse_size(args.size)
else:
    min_bytes = 0


def findDup(parentFolder):
    """Walk parentFolder recursively and return {md5_hexdigest: [paths]}.

    Every readable file is hashed; a digest mapping to two or more paths
    marks a set of identical files.
    """
    dups = {}
    print()
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print(' Scanning %s...' % dirName)
        for filename in fileList:
            path = os.path.join(dirName, filename)
            # Skip broken symlinks and files removed mid-scan.
            if os.path.exists(path):
                file_hash = hashfile(path)
                dups.setdefault(file_hash, []).append(path)
    return dups


def joinDicts(dict1, dict2):
    """Merge dict2 into dict1 in place, concatenating the path lists.

    BUG FIX: the original assigned dict1[key] = dict2[key] in BOTH branches,
    so when the same content hash appeared in two scanned directories the
    earlier paths were silently overwritten and cross-directory duplicates
    were never reported. Existing lists are now extended instead.
    """
    for key, paths in dict2.items():
        if key in dict1:
            dict1[key].extend(paths)
        else:
            dict1[key] = paths


def hashfile(path, blocksize=65536):
    """Return the MD5 hex digest of the file at path, read in blocksize chunks.

    MD5 is fine here: it is used for duplicate detection, not security.
    Uses a context manager so the handle is closed even if a read fails
    (the original leaked the file object on error).
    """
    hasher = hashlib.md5()
    with open(path, 'rb') as afile:
        while True:
            buf = afile.read(blocksize)
            if not buf:
                break
            hasher.update(buf)
    return hasher.hexdigest()


def _printGroup(dupes):
    """Print one bucket's duplicate groups, each separated by horizontal rules."""
    for dupe in dupes:
        print('___________________\n')
        for file_path in dupe:
            print(' %s' % str(file_path))
        print('___________________')


def printResults(dict1):
    """Pretty-print all duplicate groups found in dict1 ({hash: [paths]}).

    With --size, only groups whose files are at least min_bytes are shown;
    otherwise groups are reported in the predefined size buckets. The size
    of the first path in each group stands in for the whole group (the
    files are identical, so their sizes match).
    """
    final = {size: [] for size in sizes}
    if min_bytes > 0:
        final[min_bytes] = []

    # Only hashes shared by two or more paths are duplicates.
    results = [paths for paths in dict1.values() if len(paths) > 1]

    if not results:
        print('No duplicate files found.')
        return

    # Hoisted out of the loop: the original re-parsed these constants for
    # every duplicate group.
    b1, b2, b3, b4, b5 = (humanfriendly.parse_size(s) for s in sizes[:5])

    for result in results:
        file_size = os.path.getsize(result[0])
        if min_bytes > 0:
            if file_size >= min_bytes:
                final[min_bytes].append(result)
        elif file_size < b1:
            final[max_size1].append(result)
        elif file_size < b2:
            final[max_size2].append(result)
        elif file_size < b3:
            final[max_size3].append(result)
        elif file_size < b4:
            final[max_size4].append(result)
        elif file_size < b5:
            final[max_size5].append(result)
        else:
            final[max_size6].append(result)

    print('___________________')
    print('\n\033[1;34m\033[1;34m\u25b6 Duplicates Found\033[0m\n')
    print(' The following files are identical. The name could differ, but the content is identical')
    print('___________________')

    if min_bytes > 0:
        print("\n\033[1;34m\u25b6 Files bigger than %s\033[0m"
              % humanfriendly.format_size(min_bytes, binary=True))
        _printGroup(final[min_bytes])
    else:
        lower = '0'  # lower bound label of the current bucket
        for size in sizes:
            if final[size]:
                if size == max_size6:
                    # Last bucket has no upper bound.
                    print("\n\033[1;34m\u25b6 Bigger than %s\033[0m" % max_size5)
                else:
                    print("\n\033[1;34m\u25b6 Between %s and %s\033[0m" % (lower, size))
                _printGroup(final[size])
            lower = size


if __name__ == '__main__':
    if len(sys.argv) > 1:
        dups = {}
        for folder in args.dir:
            if os.path.exists(folder):
                # Merge this folder's findings into the running collection so
                # duplicates across folders are detected too.
                joinDicts(dups, findDup(folder))
            else:
                print('%s is not a valid path, please verify' % folder)
                sys.exit()
        printResults(dups)
    else:
        print('Usage: python dupFinder.py folder or python dupFinder.py folder1 folder2 folder3')