#!/usr/local/bin/python3
"""What the dupe!? -- report files with identical content under given directories.

Files are grouped by SHA-1 digest of their content; any digest that maps to
more than one path is reported as a duplicate set.  An optional --size
threshold skips files at or below the given size.
"""

import argparse
import hashlib
import os
import sys

import humanfriendly  # third-party: parses/formats human-readable sizes ("16M")


def findDup(parentFolder, min_size=0):
    """Walk parentFolder and group file paths by content hash.

    Args:
        parentFolder: directory to scan recursively.
        min_size: only hash files strictly larger than this many bytes.

    Returns:
        dict mapping SHA-1 hex digest -> list of '<human size> <path>' strings.
    """
    dups = {}
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            path = os.path.join(dirName, filename)
            # os.walk can list entries that vanish before we stat them
            # (or broken symlinks), so re-check existence first.
            if not os.path.exists(path):
                continue
            file_size = os.path.getsize(path)
            if file_size > min_size:
                file_hash = hashfile(path)
                entry = str(humanfriendly.format_size(file_size, binary=True)) + ' ' + path
                # setdefault replaces the original if/else append (and avoids
                # shadowing the builtin 'all' as the original did).
                dups.setdefault(file_hash, []).append(entry)
    return dups


def joinDicts(dict1, dict2):
    """Merge dict2 into dict1 in place, concatenating list values on key clash."""
    for key, value in dict2.items():
        if key in dict1:
            dict1[key] = dict1[key] + value
        else:
            dict1[key] = value


def hashfile(path, blocksize=65536):
    """Return the SHA-1 hex digest of the file at path, read in blocksize chunks."""
    hasher = hashlib.sha1()
    # 'with' guarantees the handle is closed even if read() raises
    # (the original leaked the file object on error).
    with open(path, 'rb') as afile:
        buf = afile.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(blocksize)
    return hasher.hexdigest()


def printResults(dict1, min_size=0):
    """Print every group of duplicate files, or a 'none found' message.

    Args:
        dict1: mapping of content hash -> list of file entries (from findDup).
        min_size: the size threshold used for the scan, for the empty message.
    """
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    if len(results) > 0:
        print('\n\033[1m Duplicates Found\033[0m\n')
        print(' The following files are identical. The name could differ, but the content is identical')
        print('______________\n')
        for result in results:
            for subresult in result:
                print(' %s' % subresult)
            print('______________\n')
    else:
        if min_size:
            print('No duplicate files bigger than ' +
                  str(humanfriendly.format_size(min_size, binary=True)) + ' found.')
        else:
            print('No duplicate files found.')


def _build_parser():
    """Construct the CLI parser with 'required arguments' listed before optional."""
    parser = argparse.ArgumentParser(description='What the dupe!?')
    # Pop the default optional group and re-append it after the required group
    # so --help lists required arguments first.
    optional = parser._action_groups.pop()
    required = parser.add_argument_group('required arguments')
    optional.add_argument('--size', type=str,
                          help='Only output files greater than \'size\'. 16, 16K, 16M, 16G, 16T')
    required.add_argument('--dir', type=str, nargs='?', required=True, action='append',
                          help='Directory to scan. Can be used multiple times.')
    parser._action_groups.append(optional)
    return parser


if __name__ == '__main__':
    if len(sys.argv) > 1:
        # Parsing here (not at import time) keeps the module importable and
        # makes the usage message below actually reachable.
        args = _build_parser().parse_args()
        min_size = humanfriendly.parse_size(args.size) if args.size else 0
        dups = {}
        for folder in args.dir:
            if os.path.exists(folder):
                # Merge this folder's duplicates into the running collection.
                joinDicts(dups, findDup(folder, min_size))
            else:
                print('%s is not a valid path, please verify' % folder)
                sys.exit(1)  # original used bare sys.exit(), which exits 0 on bad input
        printResults(dups, min_size)
    else:
        print('Usage: python dupFinder.py folder or python dupFinder.py folder1 folder2 folder3')