123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 |
- import os, sys
- import hashlib
- import argparse
- import humanfriendly
# --- Command-line interface --------------------------------------------------
# NOTE: popping parser._action_groups relies on argparse internals; it is the
# usual trick for listing the "required arguments" group ahead of the optional
# one in --help output.
parser = argparse.ArgumentParser(description='Find dupes')
optional = parser._action_groups.pop()
required = parser.add_argument_group('required arguments')
optional.add_argument('--size', type=str,
                      help='Only output files greater than \'size\'. 16, 16K, 16M, 16G, 16T')
required.add_argument('--dir', type=str, nargs='?', required=True, action='append',
                      help='Directory to scan. Can be used multiple times.')
parser._action_groups.append(optional)
args = parser.parse_args()

# Minimum file size in bytes (0 = no limit). Kept under the module-global name
# `bytes` — which shadows the builtin — because findDup() and printResults()
# read it by that name.
bytes = humanfriendly.parse_size(args.size) if args.size else 0
def findDup(parentFolder):
    """Walk parentFolder recursively and group files by content hash.

    Only files strictly larger than the module-global `bytes` threshold are
    hashed. Returns a dict mapping SHA-1 hex digest -> list of strings of the
    form "<human-readable size> <path>"; a digest with more than one entry
    marks a set of duplicate files.
    """
    dups = {}
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            path = os.path.join(dirName, filename)
            # A file can vanish between the walk and the stat (the original
            # exists()/getsize() pair raced the same way); skip it instead of
            # crashing. OSError also covers broken symlinks.
            try:
                file_size = os.path.getsize(path)
            except OSError:
                continue
            if file_size > bytes:
                file_hash = hashfile(path)
                # `entry` replaces the original local named `all`, which
                # shadowed the builtin of the same name.
                entry = str(humanfriendly.format_size(file_size, binary=True)) + ' ' + path
                dups.setdefault(file_hash, []).append(entry)
    return dups
-
-
def joinDicts(dict1, dict2):
    """Merge dict2 into dict1 in place; lists for shared keys are concatenated.

    Returns None — dict1 is mutated directly.
    """
    for key, new_values in dict2.items():
        if key in dict1:
            dict1[key] = dict1[key] + new_values
        else:
            dict1[key] = new_values
-
-
def hashfile(path, blocksize = 65536):
    """Return the SHA-1 hex digest of the file at `path`.

    The file is read in `blocksize`-byte chunks so arbitrarily large files
    can be hashed with constant memory.
    """
    hasher = hashlib.sha1()
    # `with` guarantees the handle is closed even if read() raises; the
    # original open()/close() pair leaked the handle on error.
    with open(path, 'rb') as afile:
        buf = afile.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(blocksize)
    return hasher.hexdigest()
-
-
def printResults(dict1):
    """Pretty-print every hash bucket in dict1 that holds more than one path.

    When no duplicates exist, reports that — mentioning the module-global
    `bytes` size filter if one was set.
    """
    duplicate_groups = [paths for paths in dict1.values() if len(paths) > 1]
    if not duplicate_groups:
        if bytes:
            print('No duplicate files bigger than ' + str(humanfriendly.format_size(bytes, binary=True)) + ' found.')
        else:
            print('No duplicate files found.')
        return
    print('\n\033[1m Duplicates Found\033[0m\n')
    print(' The following files are identical. The name could differ, but the content is identical')
    print('______________\n')
    for group in duplicate_groups:
        for entry in group:
            print(' %s' % entry)
        print('______________\n')
-
-
if __name__ == '__main__':
    # argparse already enforces --dir, but this argv check preserves the
    # original friendly usage line when the script is run with no arguments.
    if len(sys.argv) > 1:
        dups = {}
        folders = args.dir
        for folder in folders:
            if os.path.exists(folder):
                # Merge this folder's duplicates into the running result.
                joinDicts(dups, findDup(folder))
            else:
                print('%s is not a valid path, please verify' % folder)
                # Exit non-zero: the original bare sys.exit() reported
                # success (status 0) even though the arguments were invalid.
                sys.exit(1)
        printResults(dups)
    else:
        print('Usage: python dupFinder.py folder or python dupFinder.py folder1 folder2 folder3')
|