wtd.py

#!/usr/local/bin/python3
import argparse
import hashlib
import os
import sys

import humanfriendly

parser = argparse.ArgumentParser(description='What the dupe!?')
# Pop the default "optional arguments" group so the custom "required arguments"
# group is listed first in --help, then re-append it below.
optional = parser._action_groups.pop()
required = parser.add_argument_group('required arguments')
optional.add_argument('--size', type=str,
                      help="Only output files greater than 'size'. 16, 16K, 16M, 16G, 16T")
required.add_argument('--dir', type=str, required=True, action='append',
                      help='Directory to scan. Can be used multiple times.')
parser._action_groups.append(optional)
args = parser.parse_args()

if args.size:
    min_size = humanfriendly.parse_size(args.size)
else:
    min_size = 0
def findDup(parentFolder):
    # Duplicates in the format {hash: [names]}
    dups = {}
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            # Hash the file if it exists and is above the size threshold
            if os.path.exists(path):
                file_size = os.path.getsize(path)
                if file_size > min_size:
                    file_hash = hashfile(path)
                    # Add or append the human-readable size and file path
                    entry = humanfriendly.format_size(file_size, binary=True) + ' ' + path
                    if file_hash in dups:
                        dups[file_hash].append(entry)
                    else:
                        dups[file_hash] = [entry]
    return dups
# Merges dict2 into dict1 in place, concatenating lists that share a key
def joinDicts(dict1, dict2):
    for key in dict2.keys():
        if key in dict1:
            dict1[key] = dict1[key] + dict2[key]
        else:
            dict1[key] = dict2[key]
def hashfile(path, blocksize=65536):
    # Read the file in chunks so large files never have to fit in memory;
    # the context manager closes the file even if a read fails.
    hasher = hashlib.sha1()
    with open(path, 'rb') as afile:
        buf = afile.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(blocksize)
    return hasher.hexdigest()
def printResults(dict1):
    # Only hashes that collected more than one path are duplicates
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    if len(results) > 0:
        print('\n\033[1m Duplicates Found\033[0m\n')
        print(' The following files are identical. The name could differ, but the content is identical')
        print('______________\n')
        for result in results:
            for subresult in result:
                print(' %s' % subresult)
            print('______________\n')
    else:
        if min_size:
            print('No duplicate files bigger than ' + humanfriendly.format_size(min_size, binary=True) + ' found.')
        else:
            print('No duplicate files found.')
if __name__ == '__main__':
    # argparse has already exited with a usage message if --dir was not supplied
    dups = {}
    folders = args.dir
    for i in folders:
        # Iterate over the folders given on the command line
        if os.path.exists(i):
            # Find the duplicate files and merge them into dups
            joinDicts(dups, findDup(i))
        else:
            print('%s is not a valid path, please verify' % i)
            sys.exit()
    printResults(dups)
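
For reference, a typical invocation might look like this, assuming the script is saved as wtd.py and marked executable (the paths and the 16M threshold are only illustrative):

    ./wtd.py --dir /home/user/Pictures --dir /mnt/backup --size 16M

The script prints a `Scanning <directory>...` line for each directory it walks, then either the groups of identical files or a message that no duplicates (above the size threshold, if one was given) were found.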