#!/usr/local/bin/python3
import argparse
import hashlib
import os
import sys

import humanfriendly

parser = argparse.ArgumentParser(description='What the dupe!?')

# Pop the built-in optional group so the required arguments are listed first in --help
optional = parser._action_groups.pop()
required = parser.add_argument_group('required arguments')

mutuallyexclusive = parser.add_mutually_exclusive_group(required=False)
mutuallyexclusive.add_argument('--threshold', type=str,
                               help="Only output files greater than 'size', e.g. 100M")
mutuallyexclusive.add_argument('--sizes', type=str, nargs='+',
                               help='Size buckets to group the output by, e.g. 10M 100M 1G')

required.add_argument('--dir', type=str, nargs='?', required=True, action='append',
                      help='Directory to scan. Can be issued multiple times.')

optional.add_argument('--exclude', type=str, nargs='?', action='append',
                      help='Directory to exclude from the scan. Can be issued multiple times.')

parser._action_groups.append(optional)
args = parser.parse_args()

# Size buckets used to group the duplicates in the report
if args.sizes:
    sizes = args.sizes
else:
    sizes = ['10M', '50M', '100M', '1G', '5G']
sizes.append('larger than ' + sizes[-1])

# Directories to skip while walking (empty when --exclude is not given)
excludes = args.exclude or []

# Minimum file size to consider; 0 means no minimum
if args.threshold:
    threshold = humanfriendly.parse_size(args.threshold)
else:
    threshold = 0
def findDup(parentFolder):
    """Walk parentFolder and collect duplicates as {hash: [paths]}."""
    dups = {}
    for dirName, subdirs, fileList in os.walk(parentFolder):
        # Prune excluded directories so os.walk does not descend into them
        for excluded in excludes:
            subdirs[:] = [dn for dn in subdirs if os.path.join(dirName, dn) != excluded]
        print(' Scanning %s...' % dirName)
        for filename in fileList:
            path = os.path.join(dirName, filename)
            if not os.path.exists(path):
                continue
            # hashfile() returns None for files below the threshold or unreadable files
            file_hash = hashfile(path)
            if file_hash is None:
                continue
            # Add or append the file path under its content hash
            if file_hash in dups:
                dups[file_hash].append(path)
            else:
                dups[file_hash] = [path]
    return dups
def joinDicts(dict1, dict2):
    """Merge dict2 into dict1, concatenating the path lists of hashes present in both."""
    for key, paths in dict2.items():
        if key in dict1:
            dict1[key].extend(paths)
        else:
            dict1[key] = paths
def hashfile(path, blocksize=65536):
    """Return the MD5 hex digest of path, or None if it is below the threshold or unreadable."""
    file_size = os.path.getsize(path)
    # Only hash files larger than the threshold (if one is set)
    if threshold == 0 or file_size > threshold:
        try:
            hasher = hashlib.md5()
            with open(path, 'rb') as afile:
                buf = afile.read(blocksize)
                while buf:
                    hasher.update(buf)
                    buf = afile.read(blocksize)
            return hasher.hexdigest()
        except OSError:
            return None
    return None
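# Note: MD5 is used here only to spot identical content, not for security; if
# collision resistance matters, hashlib.sha256() is a drop-in replacement for
# hashlib.md5() above.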
def printResults(dict1):
    """Report every group of identical files, bucketed by size range or by threshold."""
    final = {}
    for size in sizes:
        final[size] = []
    if threshold > 0:
        final[threshold] = []

    # Keep only hashes that were seen for more than one path
    results = [paths for paths in dict1.values() if len(paths) > 1]

    for result in results:
        file_size = os.path.getsize(result[0])
        if not args.threshold or args.sizes:
            # Bucket the group by the size range its files fall into
            if file_size < humanfriendly.parse_size(sizes[0]):
                final[sizes[0]].append(result)
            count = 0
            while count + 1 < len(sizes):
                if count + 2 == len(sizes):
                    # Last bucket: everything at or above the largest configured size
                    if file_size >= humanfriendly.parse_size(sizes[count]):
                        final[sizes[-1]].append(result)
                elif (humanfriendly.parse_size(sizes[count]) <= file_size
                        < humanfriendly.parse_size(sizes[count + 1])):
                    final[sizes[count + 1]].append(result)
                count += 1
        else:
            # Threshold mode: a single bucket of files at or above the threshold
            if file_size >= threshold:
                final[threshold].append(result)

    populated = [bucket for bucket in final if len(final[bucket]) > 0]
    if len(results) > 0 and len(populated) > 0:
        print('___________________')
        print('\n\033[1;34m\u25b6 Duplicates Found\033[0m\n')
        print(' The following files are identical. The name could differ, but the content is identical')
        print('___________________')
        if not args.threshold or args.sizes:
            lower_bounds = ['0']
            for size in sizes:
                lower_bounds.append(size)
                if len(final[size]) > 0:
                    if size == 'larger than ' + sizes[-2]:
                        print("\n\033[1;34m\u25b6 >= %s\033[0m" % lower_bounds[-2])
                    else:
                        print("\n\033[1;34m\u25b6 %s to %s\033[0m" % (lower_bounds[-2], size))
                    for dupe in final[size]:
                        print('___________________\n')
                        for file in dupe:
                            print(' %s' % str(file))
                        print('___________________')
        else:
            print("\n\033[1;34m\u25b6 Files bigger than %s\033[0m"
                  % humanfriendly.format_size(threshold, binary=True))
            for dupe in final[threshold]:
                print('___________________\n')
                for file in dupe:
                    print(' %s' % str(file))
                print('___________________')
    else:
        print('\n\033[1mNo duplicate files found.\033[0m')
if __name__ == '__main__':
    dups = {}
    folders = args.dir
    for folder in folders:
        # Iterate over the directories given on the command line
        if os.path.exists(folder):
            # Find the duplicated files and merge them into dups
            joinDicts(dups, findDup(folder))
        else:
            print('%s is not a valid path, please verify' % folder)
            sys.exit()
    printResults(dups)
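# Example invocation (the script name and paths below are illustrative, not taken
# from the original source):
#   ./whatthedupe.py --dir ~/Downloads --dir ~/Documents --exclude ~/Downloads/cache
#   ./whatthedupe.py --dir /data --threshold 100M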