# (originally pasted as "2.py", 5.7 KB)
  1. #!/usr/local/bin/python3
  2. import os, sys, argparse
  3. import hashlib
  4. import humanfriendly
  5. parser = argparse.ArgumentParser(description='What the dupe!?')
  6. optional = parser._action_groups.pop()
  7. required = parser.add_argument_group('required arguments')
  8. mutuallyexclusive = parser.add_mutually_exclusive_group(required=False)
  9. mutuallyexclusive.add_argument('--threshold', type=str,
  10. help='Only output files greater than \'size\', e.g. 100M')
  11. mutuallyexclusive.add_argument('--sizes', type=str, nargs='+',
  12. help='Only output files greater than \'size\', e.g. 100M')
  13. required.add_argument('--dir', type=str, nargs='?', required=True, action='append',
  14. help='Directory to scan. Can be issued multiple times.')
  15. optional.add_argument('--exclude', type=str, nargs='?', action='append',
  16. help='Only output files greater than \'size\', e.g. 100M')
  17. parser._action_groups.append(optional)
  18. args = parser.parse_args()
  19. if args.sizes:
  20. sizes = args.sizes
  21. sizes.append('larger than '+sizes[-1])
  22. else:
  23. sizes = ['10M', '50M', '100M', '1G', '5G']
  24. sizes.append('larger than '+sizes[-1])
  25. if args.exclude:
  26. exclude=args.exclude
  27. if args.threshold:
  28. threshold = humanfriendly.parse_size(args.threshold)
  29. else:
  30. threshold = 0
  31. def findDup(parentFolder):
  32. # Dups in format {hash:[names]}
  33. dups = {}
  34. for dirName, subdirs, fileList in os.walk(parentFolder):
  35. # remove excluded dirs from list
  36. for exclude in args.exclude:
  37. subdirs[:] = [dn for dn in subdirs if dirName+'/'+dn != exclude]
  38. print(' Scanning %s...' % dirName)
  39. for filename in fileList:
  40. # Get the path to the file
  41. path = os.path.join(dirName, filename)
  42. # Calculate hash
  43. if os.path.exists(path):
  44. # Calculate hash
  45. file_hash = hashfile(path)
  46. # Add or append the file path
  47. if file_hash in dups:
  48. dups[file_hash].append(path)
  49. else:
  50. dups[file_hash] = [path]
  51. return dups
  52. # Joins two dictionaries
  53. def joinDicts(dict1, dict2):
  54. for key in dict2.keys():
  55. if key in dict1:
  56. dict1[key] = dict2[key]
  57. else:
  58. dict1[key] = dict2[key]
  59. def hashfile(path, blocksize = 65536):
  60. file_size = os.path.getsize(path)
  61. # Only hash files larger than threshold (if set)
  62. if threshold == 0 or (threshold > 0 and file_size > threshold):
  63. try:
  64. afile = open(path, 'rb')
  65. hasher = hashlib.md5()
  66. buf = afile.read(blocksize)
  67. while len(buf) > 0:
  68. hasher.update(buf)
  69. buf = afile.read(blocksize)
  70. afile.close()
  71. return hasher.hexdigest()
  72. except:
  73. pass
  74. def printResults(dict1):
  75. final = {}
  76. for size in sizes:
  77. final[size] = []
  78. del size
  79. if threshold > 0:
  80. final[threshold] = []
  81. results = list(filter(lambda x: len(x) > 1, dict1.values()))
  82. for result in results:
  83. file_size = os.path.getsize(result[0])
  84. if not args.threshold or (args.threshold and args.sizes):
  85. count = 0
  86. while count+1 < len(sizes):
  87. try:
  88. if file_size >= humanfriendly.parse_size(sizes[count]) and file_size < humanfriendly.parse_size(sizes[count+1] and result not in final[sizes[count+1]]):
  89. final[sizes[count+1]].append(result)
  90. print('Hi')
  91. except:
  92. print('Hi')
  93. final[sizes[-1]].append(result)
  94. count += 1
  95. if file_size < humanfriendly.parse_size(sizes[0]) and result not in final[sizes[0]]:
  96. print('Hi')
  97. final[sizes[1]].append(result)
  98. else:
  99. print('Hi')
  100. if file_size >= threshold:
  101. final[threshold].append(result)
  102. print(final)
  103. test=[x for x in final if len(final[x]) > 0]
  104. print(test)
  105. if len(results) > 0 and (len(test) > 0):
  106. print('___________________')
  107. print('\n\033[1;34m\033[1;34m\u25b6 Duplicates Found\033[0m\n')
  108. print(' The following files are identical. The name could differ, but the content is identical')
  109. print('___________________')
  110. new = ['0']
  111. if not args.threshold or (args.threshold and args.sizes):
  112. for size in sizes:
  113. new.append(size)
  114. if len(final[size]) > 0:
  115. if size == 'larger than ' + sizes[-2]:
  116. print("\n\033[1;34m\u25b6 >= %s\033[0m" % (new[-2]))
  117. else:
  118. print("\n\033[1;34m\u25b6 %s to %s\033[0m" % (new[-2],size))
  119. for dupe in final[size]:
  120. print('___________________\n')
  121. for file in dupe:
  122. print(' %s' % str(file))
  123. print('___________________')
  124. else:
  125. print("\n\033[1;34m\u25b6 Files bigger than %s\033[0m" % humanfriendly.format_size(threshold, binary=True))
  126. for dupe in final[threshold]:
  127. print('___________________\n')
  128. for file in dupe:
  129. print(' %s' % str(file))
  130. print('___________________')
  131. else:
  132. print('\n\033[1mNo duplicate files found.\033[0m')
  133. if __name__ == '__main__':
  134. dups = {}
  135. folders = args.dir
  136. for i in folders:
  137. # Iterate the folders given
  138. if os.path.exists(i):
  139. # Find the duplicated files and append them to the dups
  140. joinDicts(dups, findDup(i))
  141. else:
  142. print('%s is not a valid path, please verify' % i)
  143. sys.exit()
  144. printResults(dups)