wtd.py

import os, sys
import hashlib
import argparse
import humanfriendly

parser = argparse.ArgumentParser(description='Find dupes')
# Re-order the help output so required arguments are listed before optional ones
optional = parser._action_groups.pop()
required = parser.add_argument_group('required arguments')
optional.add_argument('--size', type=str,
                      help="Only output files greater than 'size'. 16, 16K, 16M, 16G, 16T")
required.add_argument('--dir', type=str, required=True, action='append',
                      help='Directory to scan. Can be used multiple times.')
parser._action_groups.append(optional)
args = parser.parse_args()

# Minimum file size in bytes; files this size or smaller are skipped
if args.size:
    min_size = humanfriendly.parse_size(args.size)
else:
    min_size = 0
def findDup(parentFolder):
    # Dups in format {hash: [names]}
    dups = {}
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            if os.path.exists(path):
                file_size = os.path.getsize(path)
                if file_size > min_size:
                    # Hash the file contents
                    file_hash = hashfile(path)
                    # Add or append the human-readable size and path
                    entry = humanfriendly.format_size(file_size, binary=True) + ' ' + path
                    if file_hash in dups:
                        dups[file_hash].append(entry)
                    else:
                        dups[file_hash] = [entry]
    return dups
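
# The returned mapping groups paths by content hash, e.g. (hypothetical
# hashes and paths):
#   {'da39a3...': ['1 KiB /tmp/a.txt', '1 KiB /tmp/copy_of_a.txt']}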
# Merges dict2 into dict1 in place, concatenating the lists on shared keys
def joinDicts(dict1, dict2):
    for key in dict2.keys():
        if key in dict1:
            dict1[key] = dict1[key] + dict2[key]
        else:
            dict1[key] = dict2[key]
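
# For example (hypothetical values), merging {'h1': ['a']} with
# {'h1': ['b'], 'h2': ['c']} leaves dict1 as {'h1': ['a', 'b'], 'h2': ['c']}.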
def hashfile(path, blocksize=65536):
    # Hash the file in fixed-size blocks so large files are never read into memory at once
    hasher = hashlib.sha1()
    with open(path, 'rb') as afile:
        buf = afile.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(blocksize)
    return hasher.hexdigest()
def printResults(dict1):
    # Only hashes that map to more than one path are duplicates
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    if len(results) > 0:
        print('\n\033[1m Duplicates Found\033[0m\n')
        print(' The following files are identical. Their names may differ, but their content is identical')
        print('______________\n')
        for result in results:
            for subresult in result:
                print(' %s' % subresult)
            print('______________\n')
    else:
        if min_size:
            print('No duplicate files bigger than ' +
                  humanfriendly.format_size(min_size, binary=True) + ' found.')
        else:
            print('No duplicate files found.')
if __name__ == '__main__':
    if len(sys.argv) > 1:
        dups = {}
        folders = args.dir
        # Iterate over the folders given on the command line
        for i in folders:
            if os.path.exists(i):
                # Find the duplicate files and merge them into dups
                joinDicts(dups, findDup(i))
            else:
                print('%s is not a valid path, please verify' % i)
                sys.exit()
        printResults(dups)
    else:
        print('Usage: python wtd.py --dir <folder> [--dir <folder> ...] [--size <size>]')
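
# Example invocation (paths are hypothetical placeholders):
#   python wtd.py --dir /home/alice/photos --dir /mnt/backup --size 16M
# This scans both directories, SHA-1 hashes every file larger than 16 MB,
# and prints each group of files whose contents are identical.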