wtd.py

#!/usr/local/bin/python3
import argparse
import hashlib
import os
import sys

import humanfriendly

parser = argparse.ArgumentParser(description='What the dupe!?')
# Pop the default "optional arguments" group so the custom "required arguments"
# group is listed first in --help, then re-append it below.
optional = parser._action_groups.pop()
required = parser.add_argument_group('required arguments')
optional.add_argument('--size', type=str,
                      help="Only output files greater than 'size'. 16, 16K, 16M, 16G, 16T")
required.add_argument('--dir', type=str, required=True, action='append',
                      help='Directory to scan. Can be used multiple times.')
parser._action_groups.append(optional)
args = parser.parse_args()

if args.size:
    min_size = humanfriendly.parse_size(args.size)
else:
    min_size = 0
def findDup(parentFolder):
    # Duplicates in the format {hash: [names]}
    dups = {}
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            # Hash the file if it exists and is above the size threshold
            if os.path.exists(path):
                file_size = os.path.getsize(path)
                if file_size > min_size:
                    file_hash = hashfile(path)
                    # Add or append the human-readable size and file path
                    entry = humanfriendly.format_size(file_size, binary=True) + ' ' + path
                    if file_hash in dups:
                        dups[file_hash].append(entry)
                    else:
                        dups[file_hash] = [entry]
    return dups
# Merges dict2 into dict1 in place, concatenating lists that share a key
def joinDicts(dict1, dict2):
    for key in dict2.keys():
        if key in dict1:
            dict1[key] = dict1[key] + dict2[key]
        else:
            dict1[key] = dict2[key]
def hashfile(path, blocksize=65536):
    # Read the file in chunks so large files never have to fit in memory;
    # the context manager closes the file even if a read fails.
    hasher = hashlib.sha1()
    with open(path, 'rb') as afile:
        buf = afile.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(blocksize)
    return hasher.hexdigest()
def printResults(dict1):
    # Only hashes that collected more than one path are duplicates
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    if len(results) > 0:
        print('\n\033[1m Duplicates Found\033[0m\n')
        print(' The following files are identical. The name could differ, but the content is identical')
        print('______________\n')
        for result in results:
            for subresult in result:
                print(' %s' % subresult)
            print('______________\n')
    else:
        if min_size:
            print('No duplicate files bigger than ' + humanfriendly.format_size(min_size, binary=True) + ' found.')
        else:
            print('No duplicate files found.')
if __name__ == '__main__':
    # argparse has already exited with a usage message if --dir was not supplied
    dups = {}
    folders = args.dir
    for i in folders:
        # Iterate over the folders given on the command line
        if os.path.exists(i):
            # Find the duplicate files and merge them into dups
            joinDicts(dups, findDup(i))
        else:
            print('%s is not a valid path, please verify' % i)
            sys.exit()
    printResults(dups)
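
For reference, a typical invocation might look like this, assuming the script is saved as wtd.py and marked executable (the paths and the 16M threshold are only illustrative):

    ./wtd.py --dir /home/user/Pictures --dir /mnt/backup --size 16M

The script prints a `Scanning <directory>...` line for each directory it walks, then either the groups of identical files or a message that no duplicates (above the size threshold, if one was given) were found.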