2.py

#!/usr/local/bin/python3
import os
import sys
import hashlib
import argparse

import humanfriendly

parser = argparse.ArgumentParser(description='What the dupe!?')
optional = parser._action_groups.pop()
required = parser.add_argument_group('required arguments')
optional.add_argument('--size', type=str,
                      help="Only output files greater than 'size'. 16, 16K, 16M, 16G, 16T")
required.add_argument('--dir', type=str, nargs='?', required=True, action='append',
                      help='Directory to scan. Can be used multiple times.')
parser._action_groups.append(optional)
args = parser.parse_args()

# Size buckets used to group the duplicate report. max_size6 labels the
# open-ended "larger than max_size5" bucket; it must differ from max_size5 so
# the two buckets get distinct keys in the report dictionary.
max_size1 = '10M'
max_size2 = '50M'
max_size3 = '100M'
max_size4 = '1G'
max_size5 = '5G'
max_size6 = '5G+'
sizes = [max_size1, max_size2, max_size3, max_size4, max_size5, max_size6]

# Minimum file size to report (named min_bytes to avoid shadowing the builtin)
if args.size:
    min_bytes = humanfriendly.parse_size(args.size)
else:
    min_bytes = 0
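# Quick illustration of how the thresholds above are interpreted:
#   humanfriendly.parse_size('16M')   -> 16000000  (decimal units)
#   humanfriendly.parse_size('16MiB') -> 16777216  (binary units)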
def findDup(parentFolder):
    # Dups in format {hash: [names]}
    dups = {}
    print()
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print(' Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            if os.path.exists(path):
                # Calculate hash
                file_hash = hashfile(path)
                # Add or append the file path
                if file_hash in dups:
                    dups[file_hash].append(path)
                else:
                    dups[file_hash] = [path]
    return dups
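# Shape of the dictionary findDup returns, with hypothetical hashes and paths:
#   {'9e107d9d372bb682...': ['/a/x.bin', '/b/copy-of-x.bin'],
#    '6026f38c3a7966fd...': ['/a/unique.txt']}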
# Merges dict2 into dict1, concatenating the path lists for hashes present in both
def joinDicts(dict1, dict2):
    for key in dict2.keys():
        if key in dict1:
            dict1[key] = dict1[key] + dict2[key]
        else:
            dict1[key] = dict2[key]
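# Merge behavior, with hypothetical values:
#   d1 = {'h1': ['a']}
#   joinDicts(d1, {'h1': ['b'], 'h2': ['c']})
#   d1 is now {'h1': ['a', 'b'], 'h2': ['c']}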
def hashfile(path, blocksize=65536):
    # Hash the file in fixed-size chunks so large files never have to fit in memory
    hasher = hashlib.md5()
    with open(path, 'rb') as afile:
        buf = afile.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(blocksize)
    return hasher.hexdigest()
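# Sanity check for hashfile (hypothetical path; the MD5 of an empty file is fixed):
#   hashfile('/tmp/empty.bin') -> 'd41d8cd98f00b204e9800998ecf8427e'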
def printResults(dict1):
    final = {max_size1: [], max_size2: [], max_size3: [], max_size4: [],
             max_size5: [], max_size6: []}
    if min_bytes > 0:
        final[min_bytes] = []
    # Keep only hashes that map to more than one path, i.e. actual duplicates
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    for result in results:
        file_size = os.path.getsize(result[0])
        if min_bytes > 0:
            if file_size >= min_bytes:
                final[min_bytes].append(result)
        else:
            # Sort each duplicate group into its size bucket
            if humanfriendly.parse_size(max_size1) <= file_size < humanfriendly.parse_size(max_size2):
                final[max_size2].append(result)
            elif humanfriendly.parse_size(max_size2) <= file_size < humanfriendly.parse_size(max_size3):
                final[max_size3].append(result)
            elif humanfriendly.parse_size(max_size3) <= file_size < humanfriendly.parse_size(max_size4):
                final[max_size4].append(result)
            elif humanfriendly.parse_size(max_size4) <= file_size < humanfriendly.parse_size(max_size5):
                final[max_size5].append(result)
            elif file_size >= humanfriendly.parse_size(max_size5):
                final[max_size6].append(result)
            else:
                final[max_size1].append(result)
    if len(results) > 0:
        print('___________________')
        print('\n\033[1;34m\u25b6 Duplicates Found\033[0m\n')
        print(' The following files are identical. The name could differ, but the content is identical.')
        print('___________________')
        new = ['0']
        if min_bytes > 0:
            print("\n\033[1;34m\u25b6 Files bigger than %s\033[0m" % humanfriendly.format_size(min_bytes, binary=True))
            for dupe in final[min_bytes]:
                print('___________________\n')
                for file in dupe:
                    print(' %s' % str(file))
            print('___________________')
        else:
            for size in sizes:
                # `new` remembers the previous bucket label so each heading can
                # show its range as "Between <previous> and <current>"
                new.append(size)
                if len(final[size]) > 0:
                    print("\n\033[1;34m\u25b6 Between %s and %s\033[0m" % (new[-2], size))
                    for dupe in final[size]:
                        print('___________________\n')
                        for file in dupe:
                            print(' %s' % str(file))
                    print('___________________')
    else:
        print('No duplicate files found.')
if __name__ == '__main__':
    if len(sys.argv) > 1:
        dups = {}
        folders = args.dir
        # Iterate over the folders given on the command line
        for i in folders:
            if os.path.exists(i):
                # Find the duplicated files and merge them into dups
                joinDicts(dups, findDup(i))
            else:
                print('%s is not a valid path, please verify' % i)
                sys.exit()
        printResults(dups)
    else:
        # argparse already exits when --dir is missing, so this is a fallback hint
        print('Usage: python 2.py --dir <folder> [--dir <folder> ...] [--size <size>]')
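# Example invocations (hypothetical directory names, for illustration only):
#   python3 2.py --dir /home/user/photos --dir /mnt/backup/photos
#   python3 2.py --dir /var/log --size 16M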