2.py

#!/usr/local/bin/python3
import os, sys
import hashlib
import argparse
import humanfriendly

parser = argparse.ArgumentParser(description='What the dupe!?')
optional = parser._action_groups.pop()  # Edited this line
required = parser.add_argument_group('required arguments')
optional.add_argument('--threshold', type=str,
                      help='Only output files greater than \'size\', e.g. 100M')
required.add_argument('--dir', type=str, nargs='?', required=True, action='append',
                      help='Directory to scan. Can be issued multiple times.')
parser._action_groups.append(optional)
args = parser.parse_args()

sizes = ['10M', '50M', '100M', '1G', '5G', 'gt5GB']

if args.threshold:
    bytes = humanfriendly.parse_size(args.threshold)
else:
    bytes = 0
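# A quick sanity check of the parsing above (values per humanfriendly's
# documented behaviour: decimal units by default, binary with the IEC suffix):
#   humanfriendly.parse_size('100M')   -> 100000000
#   humanfriendly.parse_size('100MiB') -> 104857600
# So a user passing --threshold 100M filters on the decimal megabyte.
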
def findDup(parentFolder):
    # Dups in format {hash: [paths]}
    dups = {}
    print()
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print(' Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            if os.path.exists(path):
                # Calculate the hash; hashfile() returns None for unreadable files
                file_hash = hashfile(path)
                if file_hash is None:
                    continue
                # Add or append the file path
                if file_hash in dups:
                    dups[file_hash].append(path)
                else:
                    dups[file_hash] = [path]
    return dups
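# Shape of the return value, for illustration (hashes truncated, paths hypothetical):
#   {'9f86d08...': ['/data/a.txt', '/backup/a_copy.txt'],
#    '60303ae...': ['/data/b.txt']}
# Only entries whose list holds more than one path are true duplicates;
# printResults() filters on exactly that below.
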
# Merges dict2 into dict1, concatenating path lists on hash collisions
def joinDicts(dict1, dict2):
    for key in dict2.keys():
        if key in dict1:
            dict1[key] = dict1[key] + dict2[key]
        else:
            dict1[key] = dict2[key]
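# Merge semantics, for illustration with made-up hashes:
#   d1 = {'h1': ['a'], 'h2': ['b']}
#   joinDicts(d1, {'h1': ['c'], 'h3': ['d']})
#   d1 -> {'h1': ['a', 'c'], 'h2': ['b'], 'h3': ['d']}
# dict1 is mutated in place, which is why the main block below can keep
# one running dups dict across all scanned directories.
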
def hashfile(path, blocksize=65536):
    try:
        hasher = hashlib.sha256()
        with open(path, 'rb') as afile:
            # Hash the file in fixed-size blocks rather than reading it whole
            buf = afile.read(blocksize)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(blocksize)
        return hasher.hexdigest()
    except OSError:
        # Unreadable file (permissions, broken link, vanished mid-scan)
        return None
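# Reading in 64 KiB blocks keeps memory flat even for multi-gigabyte files.
# On Python 3.11+ the loop could be replaced with the standard-library helper
# (a sketch, not used here so the script keeps working on older versions):
#   with open(path, 'rb') as f:
#       return hashlib.file_digest(f, 'sha256').hexdigest()
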
def printResults(dict1):
    final = {}
    for size in sizes:
        final[size] = []
    if bytes > 0:
        final[bytes] = []
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    for result in results:
        file_size = os.path.getsize(result[0])
        if bytes > 0:
            if file_size >= bytes:
                final[bytes].append(result)
        else:
            # Buckets: 0=10MB 1=50MB 2=100MB 3=1GB 4=5GB
            if humanfriendly.parse_size(sizes[0]) <= file_size < humanfriendly.parse_size(sizes[1]):
                final[sizes[1]].append(result)
            elif humanfriendly.parse_size(sizes[1]) <= file_size < humanfriendly.parse_size(sizes[2]):
                final[sizes[2]].append(result)
            elif humanfriendly.parse_size(sizes[2]) <= file_size < humanfriendly.parse_size(sizes[3]):
                final[sizes[3]].append(result)
            elif humanfriendly.parse_size(sizes[3]) <= file_size < humanfriendly.parse_size(sizes[4]):
                final[sizes[4]].append(result)
            elif file_size >= humanfriendly.parse_size(sizes[4]):
                final[sizes[5]].append(result)
            else:
                final[sizes[0]].append(result)
    if len(results) > 0:
        if not bytes:
            print('___________________')
            print('\n\033[1;34m\u25b6 Duplicates Found\033[0m\n')
            print(' The following files are identical. The names may differ, but the content is identical.')
            print('___________________')
        new = ['0']
        if bytes > 0:
            print("\n\033[1;34m\u25b6 Files bigger than %s\033[0m" % humanfriendly.format_size(bytes, binary=True))
            for dupe in final[bytes]:
                print('___________________\n')
                for file in dupe:
                    print(' %s' % str(file))
            print('___________________')
        else:
            for size in sizes:
                new.append(size)
                if len(final[size]) > 0:
                    if size == 'gt5GB':
                        print("\n\033[1;34m\u25b6 >= %s\033[0m" % (new[-2]))
                    else:
                        print("\n\033[1;34m\u25b6 %s to %s\033[0m" % (new[-2], size))
                    for dupe in final[size]:
                        print('___________________\n')
                        for file in dupe:
                            print(' %s' % str(file))
                    print('___________________')
    else:
        print('\n\033[1mNo duplicate files found.\033[0m')
if __name__ == '__main__':
    if len(sys.argv) > 1:
        dups = {}
        folders = args.dir
        # Iterate over the folders given
        for i in folders:
            if os.path.exists(i):
                # Find duplicate files and merge them into dups
                joinDicts(dups, findDup(i))
            else:
                print('%s is not a valid path, please verify' % i)
                sys.exit()
        printResults(dups)
    else:
        print('Usage: python3 2.py --dir <folder> [--dir <folder> ...] [--threshold <size>]')
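# Example invocations (paths are illustrative):
#   python3 2.py --dir /home/user/photos
#   python3 2.py --dir /home/user/photos --dir /mnt/backup --threshold 100M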