@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+
+import argparse
+import hashlib
+import os
+import sys
+
+import humanfriendly
+
+parser = argparse.ArgumentParser(description='What the dupe!?')
+
+# Pop argparse's default optional-arguments group so the custom
+# 'required arguments' group is listed first in --help output.
+optional = parser._action_groups.pop()
+required = parser.add_argument_group('required arguments')
+
+optional.add_argument('--size', type=str,
+                      help="Only output files greater than 'size'. 16, 16K, 16M, 16G, 16T")
+
+required.add_argument('--dir', type=str, nargs='?', required=True, action='append',
+                      help='Directory to scan. Can be used multiple times.')
+
+parser._action_groups.append(optional)
+
+args = parser.parse_args()
+
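+# Example invocation (paths are hypothetical):
+#   python3 dupFinder.py --dir /home/user/photos --dir /mnt/backup --size 100M
+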
+# Size-bucket boundaries used to group duplicate files in the report.
+max_size1 = '10M'
+max_size2 = '50M'
+max_size3 = '100M'
+max_size4 = '1G'
+max_size5 = '5G'
+max_size6 = '5G+'  # label for the top bucket; never passed to parse_size()
+sizes = [max_size1, max_size2, max_size3, max_size4, max_size5, max_size6]
+
+if args.size:
+    min_bytes = humanfriendly.parse_size(args.size)
+else:
+    min_bytes = 0
+
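+# Note: humanfriendly.parse_size() treats bare suffixes as decimal, so
+# parse_size('16K') == 16000 while parse_size('16KiB') == 16384.
+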
+def findDup(parentFolder):
+    # Duplicates in format {hash: [paths]}
+    dups = {}
+    print()
+    for dirName, subdirs, fileList in os.walk(parentFolder):
+        print(' Scanning %s...' % dirName)
+        for filename in fileList:
+            # Get the path to the file
+            path = os.path.join(dirName, filename)
+            if os.path.exists(path):
+                # Calculate hash
+                file_hash = hashfile(path)
+                # Add or append the file path
+                if file_hash in dups:
+                    dups[file_hash].append(path)
+                else:
+                    dups[file_hash] = [path]
+    return dups
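+
+# Possible optimization (not implemented here): group files by
+# os.path.getsize() first and only hash sizes that occur more than once,
+# so files with a unique size are never read.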
+
+
+# Merge dict2 into dict1, concatenating the path lists of shared hashes
+def joinDicts(dict1, dict2):
+    for key in dict2.keys():
+        if key in dict1:
+            dict1[key] = dict1[key] + dict2[key]
+        else:
+            dict1[key] = dict2[key]
+
+
+def hashfile(path, blocksize=65536):
+    # Read the file in blocks so large files do not have to fit in memory
+    hasher = hashlib.md5()
+    with open(path, 'rb') as afile:
+        buf = afile.read(blocksize)
+        while len(buf) > 0:
+            hasher.update(buf)
+            buf = afile.read(blocksize)
+    return hasher.hexdigest()
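+
+# MD5 is fine for spotting identical files; hashlib.sha256() could be
+# dropped in above if hash collisions are a concern, at some speed cost.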
+
+
+def printResults(dict1):
+    final = {max_size1: [], max_size2: [], max_size3: [], max_size4: [], max_size5: [], max_size6: []}
+    if min_bytes > 0:
+        final[min_bytes] = []
+    results = list(filter(lambda x: len(x) > 1, dict1.values()))
+    for result in results:
+        file_size = os.path.getsize(result[0])
+        if min_bytes > 0:
+            if file_size >= min_bytes:
+                final[min_bytes].append(result)
+        else:
+            # Sort each duplicate group into its size bucket
+            if file_size >= humanfriendly.parse_size(max_size1) and file_size < humanfriendly.parse_size(max_size2):
+                # 10M-50M
+                final[max_size2].append(result)
+            elif file_size >= humanfriendly.parse_size(max_size2) and file_size < humanfriendly.parse_size(max_size3):
+                # 50M-100M
+                final[max_size3].append(result)
+            elif file_size >= humanfriendly.parse_size(max_size3) and file_size < humanfriendly.parse_size(max_size4):
+                # 100M-1G
+                final[max_size4].append(result)
+            elif file_size >= humanfriendly.parse_size(max_size4) and file_size < humanfriendly.parse_size(max_size5):
+                # 1G-5G
+                final[max_size5].append(result)
+            elif file_size >= humanfriendly.parse_size(max_size5):
+                # 5G+
+                final[max_size6].append(result)
+            else:
+                # <10M
+                final[max_size1].append(result)
+    if len(results) > 0:
+        print('___________________')
+        print('\n\033[1;34m\u25b6 Duplicates Found\033[0m\n')
+        print(' The following files are identical. The names may differ, but the content is identical.')
+        print('___________________')
+        # Seed with '0' so new[-2] is always the previous bucket boundary
+        new = ['0']
+        if min_bytes > 0:
+            print('\n\033[1;34m\u25b6 Files bigger than %s\033[0m' % humanfriendly.format_size(min_bytes, binary=True))
+            for dupe in final[min_bytes]:
+                print('___________________\n')
+                for file in dupe:
+                    print(' %s' % str(file))
+            print('___________________')
+        else:
+            for size in sizes:
+                new.append(size)
+                if len(final[size]) > 0:
+                    print('\n\033[1;34m\u25b6 Between %s and %s\033[0m' % (new[-2], size))
+                    for dupe in final[size]:
+                        print('___________________\n')
+                        for file in dupe:
+                            print(' %s' % str(file))
+                    print('___________________')
+    else:
+        print('No duplicate files found.')
+
+
+if __name__ == '__main__':
+    if len(sys.argv) > 1:
+        dups = {}
+        folders = args.dir
+        for i in folders:
+            # Iterate over the given folders
+            if os.path.exists(i):
+                # Find duplicate files and merge them into dups
+                joinDicts(dups, findDup(i))
+            else:
+                print('%s is not a valid path, please verify' % i)
+                sys.exit()
+        printResults(dups)
+    else:
+        print('Usage: python3 dupFinder.py --dir <folder> [--dir <folder> ...] [--size <size>]')