From e0fa8c071ebfaf249ff13b2e3002adb9f5ca6715 Mon Sep 17 00:00:00 2001 From: AbysmalBiscuit Date: Wed, 21 Mar 2018 11:12:12 +0100 Subject: [PATCH] Adding tools.py as main script for using tools, as well as integrating all feature requests from #255 and #278 (#298) * Add tools.py command and control script for use as the main interface for various tools. The structure and approach is the same as faceswap.py Add many new features to tools/sort.py: various new sorting methods, grouping by folders, logging file renaming/movement, keeping original files in the input directory and improved cli options documentation. Argument parsing has been re-written to interface with tools.py Add __init__.py empty file in tools directory for python to register it as a module so that sort.py and future tools can be easily imported. * Fix various bugs where the correct sorting method would not get called. Add new sorting method: face-cnn-dissim. Update help documentation for face-cnn-dissim. Change default grouping to rename. Update initial print in all sorting/grouping methods to say precisely which method is being used. * Major refactor and redesign. Use dynamic method allocation to avoid large amounts of nested if-elif statements in process() function and to allow easily combining sort and group methods. Change cli arguments to make them more intuitive and work with the new design. Previous: '-g/--grouping' -> '-f/--final-processing' {folders,rename} Previous: '-by/--by' -> '-s/--sort-by' {blur,face,face-cnn,face-cnn-dissim,face-dissim,hist,hist-dissim} New: '-g/--group-by' {blur,face,face-cnn,hist} Add: '--logfile' -> '-lg/--logfile' PATH_TO_LOGFILE Greatly improve grouping performance. Grouping now has to sort using one of the sorting methods which makes the grouping stable and no longer dependent on how well the target files are already sorted. Sorting and grouping methods can be combined in any way. 
If no -g/--group-by is specified by user, it will default to group by the non '-dissim' version of sort method. Different combinations of sorting and grouping methods work well for different sets of data. Fixes Fix progress updates not showing properly by setting them to print to stdout instead of stderr. Fix bug in grouping by face-cnn where wrong score method was being called. Misc Add documentation for reload_list() and splice_lists() methods because it's not obvious what they do. Add warning message to tools.py to tell users to make sure they understand how the tool they want to use works before using it. Add warning message to tools/sort.py to tell users to make sure they understand how the sort tool works before using it. Update help documentation to reflect new functionality and options. Set defaults for group by face-cnn to work properly with the correct score method. Amend commit in order to sign it. * Perform unittests for all options and combinations of sort and group methods: everything OK. Fix typos in help documentation. 
--- tools.py | 32 ++ tools/__init__.py | 0 tools/sort.py | 793 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 739 insertions(+), 86 deletions(-) create mode 100755 tools.py create mode 100644 tools/__init__.py diff --git a/tools.py b/tools.py new file mode 100755 index 0000000..87bdfa8 --- /dev/null +++ b/tools.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +import sys +from lib.cli import FullHelpArgumentParser +# Importing the various tools +from tools.sort import SortProcessor + +# Python version check +if sys.version_info[0] < 3: + raise Exception("This program requires at least python3.2") +if sys.version_info[0] == 3 and sys.version_info[1] < 2: + raise Exception("This program requires at least python3.2") + + +def bad_args(args): + parser.print_help() + exit(0) + + +if __name__ == "__main__": + _tools_warning = "Please backup your data and/or test the tool you want " + _tools_warning += "to use with a smaller data set to make sure you " + _tools_warning += "understand how it works." 
+ print(_tools_warning) + + parser = FullHelpArgumentParser() + subparser = parser.add_subparsers() + sort = SortProcessor( + subparser, "sort", "This command lets you sort images using various " + "methods.") + parser.set_defaults(func=bad_args) + arguments = parser.parse_args() + arguments.func(arguments) diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tools/sort.py b/tools/sort.py index 3eb8619..7c20f48 100644 --- a/tools/sort.py +++ b/tools/sort.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import os import sys @@ -6,85 +7,240 @@ import numpy as np import cv2 from tqdm import tqdm import face_recognition +from shutil import copyfile +import json +import re + if sys.version_info[0] < 3: raise Exception("This program requires at least python3.2") if sys.version_info[0] == 3 and sys.version_info[1] < 2: raise Exception("This program requires at least python3.2") + class SortProcessor(object): + def __init__(self, subparser, command, description='default'): + self.arguments = None + self.changes = None + self.parse_arguments(description, subparser, command) - def __init__(self, parser): - self.init_parser_arguments(parser) - - def process_arguments(self, arguments): - self.arguments = arguments - self.process() + def parse_arguments(self, description, subparser, command): + parser = subparser.add_parser( + command, + help="This command lets you sort images using various methods." 
+ " Please backup your data and/or test this tool with a " + "smaller data set to make sure you understand how it" + "works.", + description=description, + epilog="Questions and feedback: \ + https://github.com/deepfakes/faceswap-playground" + ) - def init_parser_arguments(self, parser): parser.add_argument('-i', '--input', dest="input_dir", default="input_dir", help="Input directory of aligned faces.", required=True) - - parser.add_argument('-by', '--by', + + parser.add_argument('-o', '--output', + dest="output_dir", + default="__default", + help="Output directory for sorted aligned faces.") + + parser.add_argument('-f', '--final-process', type=str, - choices=("blur", "hist", "face"), - dest='method', + choices=("folders", "rename"), + dest='final_process', + default="rename", + help="'folders': files are sorted using the " + "-s/--sort-by method, then they are " + "organized into folders using the " + "-g/--group-by grouping method. " + "'rename': files are sorted using the " + "-s/--sort-by then they are renamed." + "Default: rename") + + parser.add_argument('-t', '--ref_threshold', + type=float, + dest='min_threshold', + default=-1.0, + help="Float value. " + "Minimum threshold to use for grouping " + "comparison with 'face' and 'hist' methods. " + "The lower the value the more discriminating " + "the grouping is. " + "For face 0.6 should be enough, with 0.5 " + "being very discriminating. " + "For face-cnn 7.2 should be enough, with 4 " + "being very discriminating. " + "For hist 0.3 should be enough, with 0.2 " + "being very discriminating. " + "Be careful setting a value that's too " + "low in a directory with many images, as " + "this could result in a lot of directories " + " being created. " + "Defaults: face 0.6, face-cnn 7.2, hist 0.3") + + parser.add_argument('-b', '--bins', + type=int, + dest='num_bins', + default=5, + help="Integer value. " + "Number of folders that will be used to " + "group by blur. 
Folder 0 will be the least " + "blurry, while the last folder will be the " + "blurriest. If the number of images doesn't " + "divide evenly into the number of bins, the " + "remaining images get put in the last bin as " + "they will be the blurriest by definition. " + "Default value: 5") + + parser.add_argument('-k', '--keep', + action='store_true', + dest='keep_original', + default=False, + help="Keeps the original files in the input " + "directory. Be careful when using this with " + "rename grouping and no specified output " + "directory as this would keep the original " + "and renamed files in the same directory.") + + parser.add_argument('-l', '--log-changes', + action='store_true', + dest='log_changes', + default=False, + help="Logs file renaming changes if grouping by " + "renaming, or it logs the file " + "copying/movement if grouping by folders. " + "If no log file is specified with " + "'--log-file', then a 'sort_log.json' file " + "will be created in the input directory.") + + parser.add_argument('-lf', '--log-file', + dest='log_file', + default='__default', + help="Specify a log file to use for saving the " + "renaming or grouping information. " + "Default: sort_log.json") + + parser.add_argument('-s', '--sort-by', + type=str, + choices=("blur", "face", "face-cnn", + "face-cnn-dissim", "face-dissim", "hist", + "hist-dissim"), + dest='sort_method', default="hist", - help="Sort by method.") + help="Sort by method. " + "Choose how images are sorted. 
" + "Default: hist") - def process(self): - if self.arguments.method.lower() == 'blur': - self.process_blur() - elif self.arguments.method.lower() == 'hist': - self.process_hist() - elif self.arguments.method.lower() == 'face': - self.process_face() - - def process_blur(self): - input_dir = self.arguments.input_dir - - print ("Sorting by blur...") - img_list = [ [x, self.estimate_blur(cv2.imread(x))] for x in tqdm(self.find_images(input_dir), desc="Loading") ] - print ("Sorting...") - img_list = sorted(img_list, key=operator.itemgetter(1), reverse=True) - self.process_final_rename(input_dir, img_list) - print ("Done.") - - def process_hist(self): - input_dir = self.arguments.input_dir - - print ("Sorting by histogram similarity...") - - img_list = [ [x, cv2.calcHist([cv2.imread(x)], [0], None, [256], [0, 256]) ] for x in tqdm( self.find_images(input_dir), desc="Loading") ] + parser.add_argument('-g', '--group-by', + type=str, + choices=("blur", "face", "face-cnn", "hist"), + dest='group_method', + default="__default", + help="Group by method. " + "When -fp/--final-processing by folders " + "choose the how the images are grouped after " + "sorting. " + "Default: non-dissim version of " + "-s/--sort-by method") + + parser = self.add_optional_arguments(parser) + parser.set_defaults(func=self.process_arguments) + + def add_optional_arguments(self, parser): + # Override this for custom arguments + return parser + + def process_arguments(self, arguments): + self.arguments = arguments + + # Setting default argument values that cannot be set by argparse + + # Set output dir to the same value as input dir + # if the user didn't specify it. 
+ if self.arguments.output_dir.lower() == "__default": + self.arguments.output_dir = self.arguments.input_dir + + # Set final_process to group if folders was chosen + if self.arguments.final_process.lower() == "folders": + self.arguments.final_process = "group" + + # Assign default group_method if not set by user + if self.arguments.group_method == '__default': + self.arguments.group_method = self.arguments.sort_method.replace('-dissim', '') + + # Assigning default threshold values based on grouping method + if self.arguments.min_threshold == -1.0 and self.arguments.final_process == "group": + method = self.arguments.group_method.lower() + if method == 'face': + self.arguments.min_threshold = 0.6 + elif method == 'face-cnn': + self.arguments.min_threshold = 7.2 + elif method == 'hist': + self.arguments.min_threshold = 0.3 + + # If logging is enabled, prepare container + if self.arguments.log_changes: + self.changes = dict() + + # Assign default sort_log.json value if user didn't specify one + if self.arguments.log_file.lower() == '__default': + self.arguments.log_file = os.path.join(self.arguments.input_dir, 'sort_log.json') + + self.process() + + def process(self): + """ + This method dynamically assigns the functions that will be used to run + the core process of sorting, optionally grouping, renaming/moving into + folders. After the functions are assigned they are executed. 
+ """ + __sort_method = self.arguments.sort_method.lower() + __group_method = self.arguments.group_method.lower() + final_process = self.arguments.final_process.lower() + + # Assign the methods that will be used for processing the files + sort_method = self.set_process_method("sort", __sort_method) + group_method = self.set_process_method("group", __group_method) + final_method = self.set_process_method("final_process", final_process) + + img_list = getattr(self, sort_method)() + if "group" in final_process: + # Check if non-dissim sort method and group method are not the same + if __sort_method.replace('-dissim', '') != __group_method: + img_list = self.reload_images(group_method, img_list) + img_list = getattr(self, group_method)(img_list) + else: + img_list = getattr(self, group_method)(img_list) + + getattr(self, final_method)(img_list) - img_list_len = len(img_list) - for i in tqdm ( range(0, img_list_len-1), desc="Sorting"): - min_score = 9999.9 - j_min_score = i+1 - for j in range(i+1,len(img_list)): - score = cv2.compareHist(img_list[i][1], img_list[j][1], cv2.HISTCMP_BHATTACHARYYA) - if score < min_score: - min_score = score - j_min_score = j - img_list[i+1], img_list[j_min_score] = img_list[j_min_score], img_list[i+1] - - self.process_final_rename (input_dir, img_list) - print ("Done.") - - def process_face(self): + + # Methods for sorting + def sort_blur(self): input_dir = self.arguments.input_dir - + + print ("Sorting by blur...") + img_list = [ [x, self.estimate_blur(cv2.imread(x))] for x in tqdm(self.find_images(input_dir), desc="Loading", file=sys.stdout) ] + print ("Sorting...") + + img_list = sorted(img_list, key=operator.itemgetter(1), reverse=True) + + return img_list + + def sort_face(self): + input_dir = self.arguments.input_dir + print ("Sorting by face similarity...") - img_list = [ [x, face_recognition.face_encodings(cv2.imread(x)) ] for x in tqdm( self.find_images(input_dir), desc="Loading") ] + img_list = [ [x, 
face_recognition.face_encodings(cv2.imread(x)) ] for x in tqdm( self.find_images(input_dir), desc="Loading", file=sys.stdout) ] img_list_len = len(img_list) - for i in tqdm ( range(0, img_list_len-1), desc="Sorting"): - min_score = 9999.9 + for i in tqdm ( range(0, img_list_len-1), desc="Sorting", file=sys.stdout): + min_score = float("inf") j_min_score = i+1 for j in range(i+1,len(img_list)): @@ -93,40 +249,415 @@ class SortProcessor(object): if f1encs is not None and f2encs is not None and len(f1encs) > 0 and len(f2encs) > 0: score = face_recognition.face_distance(f1encs[0], f2encs)[0] else: - score = 9999.9 + score = float("inf") if score < min_score: min_score = score j_min_score = j img_list[i+1], img_list[j_min_score] = img_list[j_min_score], img_list[i+1] - self.process_final_rename (input_dir, img_list) - - print ("Done.") - - def process_final_rename(self, input_dir, img_list): - for i in tqdm( range(0,len(img_list)), desc="Renaming" , leave=False): - src = img_list[i][0] - src_basename = os.path.basename(src) + return img_list - dst = os.path.join (input_dir, '%.5d_%s' % (i, src_basename ) ) - try: - os.rename (src, dst) - except: - print ('fail to rename %s' % (src) ) - - for i in tqdm( range(0,len(img_list)) , desc="Renaming" ): + def sort_face_dissim(self): + input_dir = self.arguments.input_dir + + print ("Sorting by face dissimilarity...") + + img_list = [ [x, face_recognition.face_encodings(cv2.imread(x)), 0 ] for x in tqdm( self.find_images(input_dir), desc="Loading", file=sys.stdout) ] + + img_list_len = len(img_list) + for i in tqdm ( range(0, img_list_len), desc="Sorting", file=sys.stdout): + score_total = 0 + for j in range( 0, img_list_len): + if i == j: + continue + try: + score_total += face_recognition.face_distance([img_list[i][1]], [img_list[j][1]]) + except: + pass + + img_list[i][2] = score_total + + + print ("Sorting...") + img_list = sorted(img_list, key=operator.itemgetter(2), reverse=True) + return img_list + + def 
sort_face_cnn(self): + from lib import FaceLandmarksExtractor + + input_dir = self.arguments.input_dir + + print ("Sorting by face-cnn similarity...") + + img_list = [] + for x in tqdm( self.find_images(input_dir), desc="Loading", file=sys.stdout): + d = FaceLandmarksExtractor.extract(cv2.imread(x), 'cnn', True) + img_list.append( [x, np.array(d[0][1]) if len(d) > 0 else np.zeros ( (68,2) ) ] ) + + img_list_len = len(img_list) + for i in tqdm ( range(0, img_list_len-1), desc="Sorting", file=sys.stdout): + min_score = float("inf") + j_min_score = i+1 + for j in range(i+1,len(img_list)): + + fl1 = img_list[i][1] + fl2 = img_list[j][1] + score = np.sum ( np.absolute ( (fl2 - fl1).flatten() ) ) + + if score < min_score: + min_score = score + j_min_score = j + img_list[i+1], img_list[j_min_score] = img_list[j_min_score], img_list[i+1] + + return img_list + + def sort_face_cnn_dissim(self): + from lib import FaceLandmarksExtractor + + input_dir = self.arguments.input_dir + + print ("Sorting by face-cnn dissimilarity...") + + img_list = [] + for x in tqdm( self.find_images(input_dir), desc="Loading", file=sys.stdout): + d = FaceLandmarksExtractor.extract(cv2.imread(x), 'cnn', True) + img_list.append( [x, np.array(d[0][1]) if len(d) > 0 else np.zeros ( (68,2) ), 0 ] ) + + img_list_len = len(img_list) + for i in tqdm( range(0, img_list_len-1), desc="Sorting", file=sys.stdout): + score_total = 0 + for j in range(i+1,len(img_list)): + if i == j: + continue + fl1 = img_list[i][1] + fl2 = img_list[j][1] + score_total += np.sum ( np.absolute ( (fl2 - fl1).flatten() ) ) + + img_list[i][2] = score_total + + print ("Sorting...") + img_list = sorted(img_list, key=operator.itemgetter(2), reverse=True) + + return img_list + + def sort_hist(self): + input_dir = self.arguments.input_dir + + print ("Sorting by histogram similarity...") + + img_list = [ [x, cv2.calcHist([cv2.imread(x)], [0], None, [256], [0, 256]) ] for x in tqdm( self.find_images(input_dir), desc="Loading", 
file=sys.stdout) ] + + img_list_len = len(img_list) + for i in tqdm( range(0, img_list_len-1), desc="Sorting", file=sys.stdout): + min_score = float("inf") + j_min_score = i+1 + for j in range(i+1,len(img_list)): + score = cv2.compareHist(img_list[i][1], img_list[j][1], cv2.HISTCMP_BHATTACHARYYA) + if score < min_score: + min_score = score + j_min_score = j + img_list[i+1], img_list[j_min_score] = img_list[j_min_score], img_list[i+1] + + return img_list + + def sort_hist_dissim(self): + input_dir = self.arguments.input_dir + + print ("Sorting by histogram dissimilarity...") + + img_list = [ [x, cv2.calcHist([cv2.imread(x)], [0], None, [256], [0, 256]), 0] for x in tqdm( self.find_images(input_dir), desc="Loading", file=sys.stdout) ] + + img_list_len = len(img_list) + for i in tqdm ( range(0, img_list_len), desc="Sorting", file=sys.stdout): + score_total = 0 + for j in range( 0, img_list_len): + if i == j: + continue + score_total += cv2.compareHist(img_list[i][1], img_list[j][1], cv2.HISTCMP_BHATTACHARYYA) + + img_list[i][2] = score_total + + + print ("Sorting...") + img_list = sorted(img_list, key=operator.itemgetter(2), reverse=True) + + return img_list + + # Methods for grouping + def group_blur(self, img_list): + # Starting the binning process + num_bins = self.arguments.num_bins + + # The last bin will get all extra images if it's + # not possible to distribute them evenly + num_per_bin = len(img_list) // num_bins + remainder = len(img_list) % num_bins + + print ("Grouping by blur...") + bins = [ [] for _ in range(num_bins) ] + image_index = 0 + for i in range(num_bins): + for j in range(num_per_bin): + bins[i].append(img_list[image_index][0]) + image_index += 1 + + # If remainder is 0, nothing gets added to the last bin. 
+ for i in range(1, remainder + 1): + bins[-1].append(img_list[-i][0]) + + return bins + + def group_face(self, img_list): + print ("Grouping by face similarity...") + + # Groups are of the form: group_num -> reference face + reference_groups = dict() + + # Bins array, where index is the group number and value is + # an array containing the file paths to the images in that group. + # The first group (0), is always the non-face group. + bins = [[]] + + # Comparison threshold used to decide how similar + # faces have to be to be grouped together. + min_threshold = self.arguments.min_threshold + + img_list_len = len(img_list) + + for i in tqdm(range(1, img_list_len), desc="Grouping", file=sys.stdout): + f1encs = img_list[i][1] + + # Check if current image is a face, if not then + # add it immediately to the non-face list. + if f1encs is None or len(f1encs) <= 0: + bins[0].append(img_list[i][0]) + + else: + current_best = [-1, float("inf")] + + for key, references in reference_groups.items(): + # Non-faces are not added to reference_groups dict, thus + # removing the need to check that f2encs is a face. + # The try-catch block is to handle the first face that gets + # processed, as the first value is None. 
+ try: + score = self.get_avg_score_faces(f1encs, references) + except TypeError: + score = float("inf") + except ZeroDivisionError: + score = float("inf") + if score < current_best[1]: + current_best[0], current_best[1] = key, score + + if current_best[1] < min_threshold: + reference_groups[current_best[0]].append(f1encs[0]) + bins[current_best[0]].append(img_list[i][0]) + else: + reference_groups[len(reference_groups)] = img_list[i][1] + bins.append([img_list[i][0]]) + + return bins + + def group_face_cnn(self, img_list): + print ("Grouping by face-cnn similarity...") + + # Groups are of the form: group_num -> reference faces + reference_groups = dict() + + # Bins array, where index is the group number and value is + # an array containing the file paths to the images in that group. + bins = [] + + # Comparison threshold used to decide how similar + # faces have to be to be grouped together. + # It is multiplied by 1000 here to allow the cli option to use smaller + # numbers. + min_threshold = self.arguments.min_threshold * 1000 + + img_list_len = len(img_list) + + for i in tqdm ( range(0, img_list_len - 1), desc="Grouping", file=sys.stdout): + fl1 = img_list[i][1] + + current_best = [-1, float("inf")] + + for key, references in reference_groups.items(): + try: + score = self.get_avg_score_faces_cnn(fl1, references) + except TypeError: + score = float("inf") + except ZeroDivisionError: + score = float("inf") + if score < current_best[1]: + current_best[0], current_best[1] = key, score + + if current_best[1] < min_threshold: + reference_groups[current_best[0]].append(fl1[0]) + bins[current_best[0]].append(img_list[i][0]) + else: + reference_groups[len(reference_groups)] = [img_list[i][1]] + bins.append([img_list[i][0]]) + + return bins + + def group_hist(self, img_list): + print ("Grouping by histogram...") + + # Groups are of the form: group_num -> reference histogram + reference_groups = dict() + + # Bins array, where index is the group number and value is + # an 
array containing the file paths to the images in that group + bins = [] + + min_threshold = self.arguments.min_threshold + + img_list_len = len(img_list) + reference_groups[0] = [img_list[0][1]] + bins.append([img_list[0][0]]) + + for i in tqdm(range(1, img_list_len), desc="Grouping", file=sys.stdout): + current_best = [-1, float("inf")] + for key, value in reference_groups.items(): + score = self.get_avg_score_hist(img_list[i][1], value) + if score < current_best[1]: + current_best[0], current_best[1] = key, score + + if current_best[1] < min_threshold: + reference_groups[current_best[0]].append(img_list[i][1]) + bins[current_best[0]].append(img_list[i][0]) + else: + reference_groups[len(reference_groups)] = [img_list[i][1]] + bins.append([img_list[i][0]]) + + return bins + + # Final process methods + def final_process_rename(self, img_list): + output_dir = self.arguments.output_dir + + process_file = self.set_process_file_method(self.arguments.log_changes, self.arguments.keep_original) + + # Make sure output directory exists + if not os.path.exists (output_dir): + os.makedirs (output_dir) + + description = ("Copying and Renaming" if self.arguments.keep_original else "Moving and Renaming") + + for i in tqdm(range(0, len(img_list)), desc=description, leave=False, file=sys.stdout): src = img_list[i][0] src_basename = os.path.basename(src) - - src = os.path.join (input_dir, '%.5d_%s' % (i, src_basename) ) - dst = os.path.join (input_dir, '%.5d%s' % (i, os.path.splitext(src_basename)[1] ) ) + + dst = os.path.join (output_dir, '%.5d_%s' % (i, src_basename ) ) + try: + process_file (src, dst, self.changes) + except FileNotFoundError as e: + print(e) + print ('fail to rename %s' % (src) ) + + for i in tqdm( range(0,len(img_list)) , desc=description, file=sys.stdout): + renaming = self.set_renaming_method(self.arguments.log_changes) + src, dst = renaming(img_list[i][0], output_dir, i, self.changes) + try: os.rename (src, dst) - except: + except FileNotFoundError as e: + 
print(e) print ('fail to rename %s' % (src) ) - - def find_images(self, input_dir): + + if self.arguments.log_changes: + self.write_to_log(self.arguments.log_file, self.changes) + + def final_process_group(self, bins): + output_dir = self.arguments.output_dir + + process_file = self.set_process_file_method(self.arguments.log_changes, self.arguments.keep_original) + + # First create new directories to avoid checking + # for directory existence in the moving loop + print ("Creating group directories.") + for i in range(len(bins)): + directory = os.path.join (output_dir, str(i)) + if not os.path.exists (directory): + os.makedirs (directory) + + description = ("Copying into Groups" if self.arguments.keep_original else "Moving into Groups") + + print ("Total groups found: {}".format(len(bins))) + for i in tqdm(range(len(bins)), desc=description, file=sys.stdout): + for j in range(len(bins[i])): + src = bins[i][j] + src_basename = os.path.basename (src) + + dst = os.path.join (output_dir, str(i), src_basename) + try: + process_file (src, dst, self.changes) + except FileNotFoundError as e: + print (e) + print ('Failed to move {0} to {1}'.format(src, dst)) + + if self.arguments.log_changes: + self.write_to_log(self.arguments.log_file, self.changes) + + # Various helper methods + def reload_images(self, group_method, img_list): + """ + Reloads the image list by replacing the comparative values with those + that the chosen grouping method expects. + :param group_method: str name of the grouping method that will be used. + :param img_list: image list that has been sorted by one of the sort + methods. + :return: img_list but with the comparative values that the chosen + grouping method expects. 
+ """ + input_dir = self.arguments.input_dir + print("Preparing to group...") + if group_method == 'group_blur': + temp_list = [[x, self.estimate_blur(cv2.imread(x))] for x in tqdm(self.find_images(input_dir), desc="Reloading", file=sys.stdout)] + elif group_method == 'group_face': + temp_list = [[x, face_recognition.face_encodings(cv2.imread(x))] for x in tqdm(self.find_images(input_dir), desc="Reloading", file=sys.stdout)] + elif group_method == 'group_face_cnn': + from lib import FaceLandmarksExtractor + temp_list = [] + for x in tqdm(self.find_images(input_dir), desc="Reloading", file=sys.stdout): + d = FaceLandmarksExtractor.extract(cv2.imread(x), 'cnn', True) + temp_list.append([x, np.array(d[0][1]) if len(d) > 0 else np.zeros((68, 2))]) + elif group_method == 'group_hist': + temp_list = [[x, cv2.calcHist([cv2.imread(x)], [0], None, [256], [0, 256])] for x in tqdm(self.find_images(input_dir), desc="Reloading", file=sys.stdout)] + else: + raise ValueError("{} group_method not found.".format(group_method)) + + return self.splice_lists(img_list, temp_list) + + @staticmethod + def splice_lists(sorted_list, new_vals_list): + """ + This method replaces the value at index 1 in each sub-list in the + sorted_list with the value that is calculated for the same img_path, + but found in new_vals_list. + + Format of lists: [[img_path, value], [img_path2, value2], ...] + + :param sorted_list: list that has been sorted by one of the sort + methods. + :param new_vals_list: list that has been loaded by a different method + than the sorted_list. + :return: list that is sorted in the same way as the input sorted list + but the values corresponding to each image are from new_vals_list. 
+ """ + new_list = [] + # Make new list of just image paths to serve as an index + val_index_list = [i[0] for i in new_vals_list] + for i in tqdm(range(len(sorted_list)), desc="Splicing", file=sys.stdout): + current_image = sorted_list[i][0] + new_val_index = val_index_list.index(current_image) + new_list.append([current_image, new_vals_list[new_val_index][1]]) + + return new_list + + @staticmethod + def find_images(input_dir): result = [] extensions = [".jpg", ".png", ".jpeg"] for root, dirs, files in os.walk(input_dir): @@ -135,28 +666,118 @@ class SortProcessor(object): result.append (os.path.join(root, file)) return result - def estimate_blur(self, image): + @staticmethod + def estimate_blur(image): if image.ndim == 3: image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) blur_map = cv2.Laplacian(image, cv2.CV_64F) score = np.var(blur_map) return score - - def error(self, message): - self.print_help(sys.stderr) - args = {'prog': self.prog, 'message': message} - self.exit(2, '%(prog)s: error: %(message)s\n' % args) + + @staticmethod + def set_process_method(prefix, method): + _method = re.sub(r'-', r'_', method) + return prefix + "_" + _method + + @staticmethod + def set_process_file_method(log_changes, keep_original): + """ + Assigns the final file processing method based on whether changes are + being logged and whether the original files are being kept in the + input directory. 
+ Relevant cli arguments: -k, -l + :return: function reference + """ + if log_changes: + if keep_original: + def process_file(src, dst, changes): + copyfile(src, dst) + changes[src] = dst + return process_file + else: + def process_file(src, dst, changes): + os.rename(src, dst) + changes[src] = dst + return process_file + else: + if keep_original: + def process_file(src, dst, changes): + copyfile(src, dst) + return process_file + else: + def process_file(src, dst, changes): + os.rename(src, dst) + return process_file + + @staticmethod + def set_renaming_method(log_changes): + if log_changes: + def renaming(src, output_dir, i, changes): + src_basename = os.path.basename(src) + + __src = os.path.join (output_dir, '%.5d_%s' % (i, src_basename) ) + dst = os.path.join (output_dir, '%.5d%s' % (i, os.path.splitext(src_basename)[1] ) ) + changes[src] = dst + return __src, dst + return renaming + + else: + def renaming(src, output_dir, i, changes): + src_basename = os.path.basename(src) + + src = os.path.join (output_dir, '%.5d_%s' % (i, src_basename) ) + dst = os.path.join (output_dir, '%.5d%s' % (i, os.path.splitext(src_basename)[1] ) ) + return src, dst + return renaming + + @staticmethod + def get_avg_score_hist(img1, references): + scores = [] + for img2 in references: + score = cv2.compareHist(img1, img2, cv2.HISTCMP_BHATTACHARYYA) + scores.append(score) + return sum(scores)/len(scores) + + @staticmethod + def get_avg_score_faces(f1encs, references): + scores = [] + for f2encs in references: + score = face_recognition.face_distance(f1encs, f2encs)[0] + scores.append(score) + return sum(scores)/len(scores) + + @staticmethod + def get_avg_score_faces_cnn(fl1, references): + scores = [] + for fl2 in references: + score = np.sum ( np.absolute ( (fl2 - fl1).flatten() ) ) + scores.append(score) + return sum(scores)/len(scores) + + @staticmethod + def write_to_log(log_file, changes): + with open(log_file, 'w') as lf: + json.dump(changes, lf, sort_keys=True, indent=4) + def 
bad_args(args): parser.print_help() exit(0) + if __name__ == "__main__": + __warning_string = "Important: face-cnn method will cause an error when " + __warning_string += "this tool is called directly instead of through the " + __warning_string += "tools.py command script." + print (__warning_string) print ("Images sort tool.\n") parser = argparse.ArgumentParser() + subparser = parser.add_subparsers() + sort = SortProcessor( + subparser, "sort", "Sort images using various methods.") + parser.set_defaults(func=bad_args) - - sort = SortProcessor(parser) - sort.process_arguments(parser.parse_args()) \ No newline at end of file + arguments = parser.parse_args() + arguments.func(arguments)