@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+
+import argparse
+import hashlib
+import os
+import sys
+
+import humanfriendly
+
+parser = argparse.ArgumentParser(description='What the dupe!?')
+
+# Pop argparse's default optional-arguments group so the custom
+# 'required arguments' group is listed first in --help output.
+optional = parser._action_groups.pop()
+required = parser.add_argument_group('required arguments')
+
+optional.add_argument('--size', type=str,
+                      help="Only output files greater than 'size'. 16, 16K, 16M, 16G, 16T")
+
+required.add_argument('--dir', type=str, nargs='?', required=True, action='append',
+                      help='Directory to scan. Can be used multiple times.')
+
+parser._action_groups.append(optional)
+
+args = parser.parse_args()
+
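+# Example invocation (paths are hypothetical):
+#   python3 dupFinder.py --dir /home/user/photos --dir /mnt/backup --size 100M
+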
+# Size-bucket boundaries used to group duplicate files in the report.
+max_size1 = '10M'
+max_size2 = '50M'
+max_size3 = '100M'
+max_size4 = '1G'
+max_size5 = '5G'
+max_size6 = '5G+'  # label for the top bucket; never passed to parse_size()
+sizes = [max_size1, max_size2, max_size3, max_size4, max_size5, max_size6]
+
+if args.size:
+    min_bytes = humanfriendly.parse_size(args.size)
+else:
+    min_bytes = 0
+
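+# Note: humanfriendly.parse_size() treats bare suffixes as decimal, so
+# parse_size('16K') == 16000 while parse_size('16KiB') == 16384.
+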
+def findDup(parentFolder):
+    # Duplicates in format {hash: [paths]}
+    dups = {}
+    print()
+    for dirName, subdirs, fileList in os.walk(parentFolder):
+        print(' Scanning %s...' % dirName)
+        for filename in fileList:
+            # Get the path to the file
+            path = os.path.join(dirName, filename)
+            if os.path.exists(path):
+                # Calculate hash
+                file_hash = hashfile(path)
+                # Add or append the file path
+                if file_hash in dups:
+                    dups[file_hash].append(path)
+                else:
+                    dups[file_hash] = [path]
+    return dups
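+
+# Possible optimization (not implemented here): group files by
+# os.path.getsize() first and only hash sizes that occur more than once,
+# so files with a unique size are never read.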
+
+
+# Merge dict2 into dict1, concatenating the path lists of shared hashes
+def joinDicts(dict1, dict2):
+    for key in dict2.keys():
+        if key in dict1:
+            dict1[key] = dict1[key] + dict2[key]
+        else:
+            dict1[key] = dict2[key]
+
+
+def hashfile(path, blocksize=65536):
+    # Read the file in blocks so large files do not have to fit in memory
+    hasher = hashlib.md5()
+    with open(path, 'rb') as afile:
+        buf = afile.read(blocksize)
+        while len(buf) > 0:
+            hasher.update(buf)
+            buf = afile.read(blocksize)
+    return hasher.hexdigest()
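+
+# MD5 is fine for spotting identical files; hashlib.sha256() could be
+# dropped in above if hash collisions are a concern, at some speed cost.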
+
+
+def printResults(dict1):
+    final = {max_size1: [], max_size2: [], max_size3: [], max_size4: [], max_size5: [], max_size6: []}
+    if min_bytes > 0:
+        final[min_bytes] = []
+    results = list(filter(lambda x: len(x) > 1, dict1.values()))
+    for result in results:
+        file_size = os.path.getsize(result[0])
+        if min_bytes > 0:
+            if file_size >= min_bytes:
+                final[min_bytes].append(result)
+        else:
+            # Sort each duplicate group into its size bucket
+            if file_size >= humanfriendly.parse_size(max_size1) and file_size < humanfriendly.parse_size(max_size2):
+                # 10M-50M
+                final[max_size2].append(result)
+            elif file_size >= humanfriendly.parse_size(max_size2) and file_size < humanfriendly.parse_size(max_size3):
+                # 50M-100M
+                final[max_size3].append(result)
+            elif file_size >= humanfriendly.parse_size(max_size3) and file_size < humanfriendly.parse_size(max_size4):
+                # 100M-1G
+                final[max_size4].append(result)
+            elif file_size >= humanfriendly.parse_size(max_size4) and file_size < humanfriendly.parse_size(max_size5):
+                # 1G-5G
+                final[max_size5].append(result)
+            elif file_size >= humanfriendly.parse_size(max_size5):
+                # 5G+
+                final[max_size6].append(result)
+            else:
+                # <10M
+                final[max_size1].append(result)
+    if len(results) > 0:
+        print('___________________')
+        print('\n\033[1;34m\u25b6 Duplicates Found\033[0m\n')
+        print(' The following files are identical. The names may differ, but the content is identical.')
+        print('___________________')
+        # Seed with '0' so new[-2] is always the previous bucket boundary
+        new = ['0']
+        if min_bytes > 0:
+            print('\n\033[1;34m\u25b6 Files bigger than %s\033[0m' % humanfriendly.format_size(min_bytes, binary=True))
+            for dupe in final[min_bytes]:
+                print('___________________\n')
+                for file in dupe:
+                    print(' %s' % str(file))
+            print('___________________')
+        else:
+            for size in sizes:
+                new.append(size)
+                if len(final[size]) > 0:
+                    print('\n\033[1;34m\u25b6 Between %s and %s\033[0m' % (new[-2], size))
+                    for dupe in final[size]:
+                        print('___________________\n')
+                        for file in dupe:
+                            print(' %s' % str(file))
+                    print('___________________')
+    else:
+        print('No duplicate files found.')
+
+
+if __name__ == '__main__':
+    if len(sys.argv) > 1:
+        dups = {}
+        folders = args.dir
+        for i in folders:
+            # Iterate over the given folders
+            if os.path.exists(i):
+                # Find duplicate files and merge them into dups
+                joinDicts(dups, findDup(i))
+            else:
+                print('%s is not a valid path, please verify' % i)
+                sys.exit()
+        printResults(dups)
+    else:
+        print('Usage: python3 dupFinder.py --dir <folder> [--dir <folder> ...] [--size <size>]')