@@ -6,12 +6,15 @@ import humanfriendly

parser = argparse.ArgumentParser(description='What the dupe!?')

-optional = parser._action_groups.pop() # Edited this line
+optional = parser._action_groups.pop()
required = parser.add_argument_group('required arguments')

optional.add_argument('--threshold', type=str,
                      help='Only output files greater than \'size\', e.g. 100M')

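+# --exclude lets whole directory paths be skipped by findDup()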
+optional.add_argument('--exclude', type=str, nargs='?', action='append',
+                      help='Directory to exclude from the scan. Can be issued multiple times.')
+
required.add_argument('--dir', type=str, nargs='?', required=True, action='append',
                      help='Directory to scan. Can be issued multiple times.')

@@ -21,29 +24,37 @@ args = parser.parse_args()

sizes = ['10M', '50M', '100M', '1G', '5G', 'gt5GB']

+# Report which directories will be skipped during the scan
+if args.exclude:
+    print('Excluding: %s' % args.exclude)
+
if args.threshold:
-    bytes = humanfriendly.parse_size(args.threshold)
+    threshold = humanfriendly.parse_size(args.threshold)
else:
-    bytes = 0
+    threshold = 0

def findDup(parentFolder):
    # Dups in format {hash:[names]}
    dups = {}
    print()
    for dirName, subdirs, fileList in os.walk(parentFolder):
-        print(' Scanning %s...' % dirName)
-        for filename in fileList:
-            # Get the path to the file
-            path = os.path.join(dirName, filename)
-            # Calculate hash
-            if os.path.exists(path):
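+        # Skip any directory that was passed via --exclude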
+        if args.exclude and dirName in args.exclude:
+            continue
+        else:
+            print(' Scanning %s...' % dirName)
+            for filename in fileList:
+                # Get the path to the file
+                path = os.path.join(dirName, filename)
                # Calculate hash
+                if os.path.exists(path):
+                    # Calculate hash
+                    file_hash = hashfile(path)
+                    # Add or append the file path
+                    if file_hash in dups:
+                        dups[file_hash].append(path)
+                    else:
+                        dups[file_hash] = [path]
    return dups


@@ -57,17 +68,21 @@ def joinDicts(dict1, dict2):


def hashfile(path, blocksize = 65536):
-    try:
-        afile = open(path, 'rb')
-        hasher = hashlib.sha256()
-        buf = afile.read(blocksize)
-        while len(buf) > 0:
-            hasher.update(buf)
+    file_size = os.path.getsize(path)
+    # Only hash files larger than the threshold
+    if threshold == 0 or file_size > threshold:
+        try:
+            print('Hashing ' + path)
+            afile = open(path, 'rb')
+            hasher = hashlib.sha256()
            buf = afile.read(blocksize)
+            while len(buf) > 0:
+                hasher.update(buf)
+                buf = afile.read(blocksize)
+            afile.close()
+            return hasher.hexdigest()
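+        # Any error while opening or reading the file is ignored; hashfile() then returns None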
+        except:
+            pass


def printResults(dict1):
@@ -75,14 +90,14 @@ def printResults(dict1):
    for size in sizes:
        final[size] = []
    del size
-    if bytes > 0:
-        final[bytes] = []
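+    # A custom threshold gets its own result bucket, keyed by the parsed byte count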
+    if threshold > 0:
+        final[threshold] = []
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    for result in results:
        file_size = os.path.getsize(result[0])
-        if bytes > 0:
-            if file_size >= bytes:
-                final[bytes].append(result)
+        if threshold > 0:
+            if file_size >= threshold:
+                final[threshold].append(result)

        else:
            #0=10MB 1=50MB 2=100MB 3=1GB 4=5GB
@@ -98,15 +113,16 @@ def printResults(dict1):
                final[sizes[5]].append(result)
            else:
                final[sizes[0]].append(result)
-    if len(results) > 0 and not bytes:
+    # Print the report when there is something to show: any duplicates when no
+    # threshold was given, or at least one group that met the threshold.
+    if len(results) > 0 and (threshold == 0 or len(final[threshold]) > 0):
        print('___________________')
        print('\n\033[1;34m\033[1;34m\u25b6 Duplicates Found\033[0m\n')
        print(' The following files are identical. The name could differ, but the content is identical')
        print('___________________')
        new = ['0']
-        if bytes > 0:
-            print("\n\033[1;34m\u25b6 Files bigger than %s\033[0m" % humanfriendly.format_size(bytes, binary=True))
-            for dupe in final[bytes]:
+        if threshold > 0:
+            print("\n\033[1;34m\u25b6 Files bigger than %s\033[0m" % humanfriendly.format_size(threshold, binary=True))
+            for dupe in final[threshold]:
                print('___________________\n')
                for file in dupe:
                    print(' %s' % str(file))