#!/usr/local/bin/python3
# rootless / what-the-dupe
import os, sys
import hashlib
import argparse
import humanfriendly

# Command-line interface. Arguments are parsed at import time; the functions
# below read `args`, `sizes` and `threshold` as module-level settings.
parser = argparse.ArgumentParser(description='What the dupe!?')

# Pop the last action group (argparse's built-in options group) and re-append
# it after the custom "required arguments" group so that --help lists the
# required options first.
optional = parser._action_groups.pop()
required = parser.add_argument_group('required arguments')

optional.add_argument('--threshold', type=str,
                    help='Only output files greater than \'size\', e.g. 100M')

# default=[] keeps args.exclude iterable when the flag is never given.
optional.add_argument('--exclude', type=str, nargs='?', action='append',
                    default=[],
                    help='Directory to skip while scanning. Can be issued multiple times.')

optional.add_argument('--sizes', type=str, nargs='+',
                    help='Ascending size boundaries used to group the results, '
                         'e.g. 10M 50M 100M 1G 5G')

required.add_argument('--dir', type=str, nargs='?', required=True, action='append',
                    help='Directory to scan. Can be issued multiple times.')

parser._action_groups.append(optional)

args = parser.parse_args()

# Size boundaries for the report. The final entry is not a size but the label
# of the open-ended bucket above the largest boundary.
sizes = args.sizes if args.sizes else ['10M', '50M', '100M', '1G', '5G']
sizes.append('larger than ' + sizes[-1])

# Directories to skip (empty list when --exclude was not given).
exclude = args.exclude

# Minimum size in bytes; 0 means "no threshold".
threshold = humanfriendly.parse_size(args.threshold) if args.threshold else 0

def findDup(parentFolder):
    """Walk ``parentFolder`` and group the files found by content hash.

    Directories named by ``--exclude`` (``args.exclude``) are pruned from the
    walk. Files for which ``hashfile`` returns ``None`` (it does so for files
    it skips or cannot read) are left out, so they are never reported as
    duplicates of each other.

    Args:
        parentFolder: root directory to scan.

    Returns:
        dict mapping each hash to the list of file paths with that hash,
        in the format ``{hash: [paths]}``.
    """
    # --exclude may be missing entirely (None) and, being declared with
    # nargs='?', may contain None entries; normalise once, outside the walk.
    # normpath makes "dir/", "./dir" and "dir" compare equal.
    excluded = {os.path.normpath(entry)
                for entry in (args.exclude or []) if entry}

    dups = {}
    for dirName, subdirs, fileList in os.walk(parentFolder):
        # Prune in place so os.walk does not descend into excluded dirs.
        subdirs[:] = [
            dn for dn in subdirs
            if os.path.normpath(os.path.join(dirName, dn)) not in excluded
        ]
        print('  Scanning %s...' % dirName)
        for filename in fileList:
            path = os.path.join(dirName, filename)
            # Skip entries that no longer resolve to anything on disk.
            if not os.path.exists(path):
                continue
            file_hash = hashfile(path)
            if file_hash is None:
                # Not hashed (skipped or unreadable): not a duplicate candidate.
                continue
            dups.setdefault(file_hash, []).append(path)
    return dups


def joinDicts(dict1, dict2):
    """Merge ``dict2`` into ``dict1`` in place, concatenating shared keys.

    Both arguments map a key to a list. For a key present in both, the
    entries of ``dict2`` are appended to the existing list in ``dict1``
    instead of replacing it, so values collected from several sources are
    all kept.

    Args:
        dict1: destination mapping of key -> list; modified in place.
        dict2: source mapping of key -> list; left untouched.

    Returns:
        None.
    """
    for key, values in dict2.items():
        # setdefault creates a fresh list for new keys, so dict1 never ends
        # up sharing (and later mutating) a list object owned by dict2.
        dict1.setdefault(key, []).extend(values)


def hashfile(path, blocksize = 65536):
    """Return the MD5 hex digest of the file at ``path``, or ``None``.

    The file is read in chunks of ``blocksize`` bytes. When the module-level
    ``threshold`` is non-zero, only files strictly larger than ``threshold``
    bytes are hashed.

    Args:
        path: file to hash.
        blocksize: number of bytes read per chunk.

    Returns:
        The hex digest string, or ``None`` when the file is skipped because
        of the threshold or cannot be stat'ed/read.
    """
    try:
        file_size = os.path.getsize(path)
        # Only hash files larger than threshold (if set)
        if not (threshold == 0 or (threshold > 0 and file_size > threshold)):
            return None
        hasher = hashlib.md5()
        # The context manager closes the handle even if a read fails midway.
        with open(path, 'rb') as afile:
            buf = afile.read(blocksize)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(blocksize)
    except OSError:
        # Best effort: an unreadable or vanished file is simply not hashed.
        return None
    return hasher.hexdigest()


def printResults(dict1):
    """Print every group of identical files, organised by file size.

    Only entries of ``dict1`` that list more than one path are reported. The
    size of a group is taken from its first path. Two module-level settings
    select the layout:

    * ``threshold > 0``: a single section with the groups whose size is at
      least ``threshold`` bytes.
    * otherwise: one section per bucket defined by ``sizes``. Every entry of
      ``sizes`` except the last is a size boundary (expected in ascending
      order); the last entry is the label of the open-ended top bucket.

    Args:
        dict1: mapping of hash -> list of file paths.

    Returns:
        None. Output goes to stdout.

    Raises:
        Whatever ``humanfriendly.parse_size`` raises for a boundary in
        ``sizes`` that it cannot parse; invalid boundaries are reported
        rather than silently mis-bucketing the results.
    """
    rule = '___________________'

    def show(groups):
        # One ruled paragraph per group, closed by a final rule.
        for dupe in groups:
            print(rule + '\n')
            for file in dupe:
                print('  %s' % str(file))
        print(rule)

    # Pair each duplicate group with the size of its first member.
    sized = []
    for paths in dict1.values():
        if len(paths) <= 1:
            continue
        try:
            sized.append((os.path.getsize(paths[0]), paths))
        except OSError:
            # The file disappeared or became unreadable after the scan;
            # skip the group instead of aborting the whole report.
            continue

    by_threshold = threshold > 0
    if by_threshold:
        selected = [paths for file_size, paths in sized
                    if file_size >= threshold]
        found = len(selected) > 0
    else:
        found = len(sized) > 0
        if found:
            # Parse each boundary once, before anything is printed.
            bounds = [humanfriendly.parse_size(label) for label in sizes[:-1]]
            final = {label: [] for label in sizes}
            for file_size, paths in sized:
                # A group goes into the first bucket whose upper boundary it
                # stays below; anything at or above the largest boundary
                # lands in the open-ended last bucket.
                bucket = sizes[-1]
                for label, upper in zip(sizes, bounds):
                    if file_size < upper:
                        bucket = label
                        break
                final[bucket].append(paths)

    if not found:
        print('\n\033[1mNo duplicate files found.\033[0m')
        return

    print(rule)
    print('\n\033[1;34m\033[1;34m\u25b6 Duplicates Found\033[0m\n')
    print('  The following files are identical. The name could differ, but the content is identical')
    print(rule)

    if by_threshold:
        print("\n\033[1;34m\u25b6 Files bigger than %s\033[0m" % humanfriendly.format_size(threshold, binary=True))
        show(selected)
        return

    # Each bucket is labelled with its lower bound (the previous boundary,
    # starting from '0') and its own upper bound.
    previous = '0'
    last_index = len(sizes) - 1
    for index, label in enumerate(sizes):
        if len(final[label]) > 0:
            if index == last_index:
                print("\n\033[1;34m\u25b6 >= %s\033[0m" % (previous))
            else:
                print("\n\033[1;34m\u25b6 %s to %s\033[0m" % (previous, label))
            show(final[label])
        previous = label

if __name__ == '__main__':
    # The module-level parser.parse_args() call has already exited if no
    # --dir was supplied, so no separate argv-length check is needed here.
    folders = args.dir

    # Validate every folder before the (potentially long) scan starts.
    # --dir is declared with nargs='?', so a bare "--dir" yields None.
    for i in folders:
        if i is None or not os.path.exists(i):
            # sys.exit(str) writes the message to stderr and exits with
            # status 1, so callers can detect the failure.
            sys.exit('%s is not a valid path, please verify' % i)

    dups = {}
    for i in folders:
        # Find the duplicated files and merge them into dups
        joinDicts(dups, findDup(i))
    printResults(dups)