Image manipulation with TensorFlow

Meet image operations in TensorFlow (II)

Here we focus on the train method. A key issue in training is how samples are selected. When training the network with triplet loss, there is a serious problem: the numbers of positive and negative sample pairs differ enormously, so hard examples have to be mined. Strictly speaking, the strategy used in FaceNet cannot be called OHEM or hard example mining, but it carries the same core idea. If you want to train the model better, the sample-selection part is the place to optimize.
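For reference, the triplet loss being optimized is max(||f(a) - f(p)||^2 - ||f(a) - f(n)||^2 + alpha, 0), averaged over the selected triplets. The repository's own TensorFlow implementation lives in facenet.py; the snippet below is only a minimal NumPy sketch of the same idea, with illustrative names (anchor_emb, pos_emb, neg_emb) that do not come from the FaceNet code:

import numpy as np

def triplet_loss_np(anchor_emb, pos_emb, neg_emb, alpha=0.2):
    # anchor_emb, pos_emb, neg_emb: (batch, embedding_size) L2-normalized embeddings
    # alpha: the margin, corresponding to args.alpha below
    pos_dist = np.sum(np.square(anchor_emb - pos_emb), axis=1)  # ||f(a) - f(p)||^2
    neg_dist = np.sum(np.square(anchor_emb - neg_emb), axis=1)  # ||f(a) - f(n)||^2
    # Only triplets where the negative is not yet alpha farther away than the positive contribute
    return np.mean(np.maximum(pos_dist - neg_dist + alpha, 0.0))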

def train(args, sess, dataset, epoch, image_paths_placeholder, labels_placeholder, labels_batch,
          batch_size_placeholder, learning_rate_placeholder, phase_train_placeholder, enqueue_op, input_queue, global_step, 
          embeddings, loss, train_op, summary_op, summary_writer, learning_rate_schedule_file,
          embedding_size, anchor, positive, negative, triplet_loss):
    batch_number = 0
    # Get the learning rate
    if args.learning_rate > 0.0:
        lr = args.learning_rate
    else:
        lr = facenet.get_learning_rate_from_file(learning_rate_schedule_file, epoch)
    # epoch_size is the number of batches per epoch, where each batch has batch_size samples.
    # For example, if the total sample count is 100000 and batch_size is 100, then
    # epoch_size = 100000 / 100 = 1000
    while batch_number < args.epoch_size:
        # Randomly sample image paths from the data source.
        # The number of images drawn each time is determined by args.people_per_batch and args.images_per_person:
        # args.people_per_batch is how many people (identities) are in the current batch
        # args.images_per_person is how many images are taken per person
        image_paths, num_per_class = sample_people(dataset, args.people_per_batch, args.images_per_person)
        
        print('Running forward pass on sampled images: ', end='')
        start_time = time.time()
        # Get the total number of sample pictures
        nrof_examples = args.people_per_batch * args.images_per_person
        labels_array = np.reshape(np.arange(nrof_examples), (-1, 3))
        image_paths_array = np.reshape(np.expand_dims(np.array(image_paths), 1), (-1, 3))
        sess.run(enqueue_op, {image_paths_placeholder: image_paths_array, labels_placeholder: labels_array})
        emb_array = np.zeros((nrof_examples, embedding_size))
        # Compute how many batches the forward pass needs
        nrof_batches = int(np.ceil(nrof_examples / args.batch_size))
        # For each batch, pull data from the queue and run a forward pass to get the embeddings and labels
        for i in range(nrof_batches):
            batch_size = min(nrof_examples-i*args.batch_size, args.batch_size)
            emb, lab = sess.run([embeddings, labels_batch], feed_dict={batch_size_placeholder: batch_size, 
                learning_rate_placeholder: lr, phase_train_placeholder: True})
            emb_array[lab, :] = emb
        print('%.3f' % (time.time()-start_time))

        # After getting the embeddings and labels of this batch, the loss is not computed immediately;
        # instead, select_triplets picks the triplets on which the loss will actually be computed
        print('Selecting suitable triplets for training')
        triplets, nrof_random_negs, nrof_triplets = select_triplets(emb_array, num_per_class, 
            image_paths, args.people_per_batch, args.alpha)
        selection_time = time.time() - start_time
        print('(nrof_random_negs, nrof_triplets) = (%d, %d): time=%.3f seconds' % 
            (nrof_random_negs, nrof_triplets, selection_time))

        # Recompute the number of batches for the selected triplets
        nrof_batches = int(np.ceil(nrof_triplets*3/args.batch_size))
        triplet_paths = list(itertools.chain(*triplets))
        labels_array = np.reshape(np.arange(len(triplet_paths)),(-1,3))
        triplet_paths_array = np.reshape(np.expand_dims(np.array(triplet_paths), 1), (-1, 3))
        sess.run(enqueue_op, {image_paths_placeholder: triplet_paths_array, labels_placeholder: labels_array})
        nrof_examples = len(triplet_paths)
        train_time = 0
        i = 0
        emb_array = np.zeros((nrof_examples, embedding_size))
        loss_array = np.zeros((nrof_triplets,))
        summary = tf.Summary()
        step = 0
        # Loop over the batches of selected triplets
        while i < nrof_batches:
            start_time = time.time()
            batch_size = min(nrof_examples-i*args.batch_size, args.batch_size)
            feed_dict = {batch_size_placeholder: batch_size, learning_rate_placeholder: lr, phase_train_placeholder: True}
            # After fetching the data, run a training step and compute the loss; this is the per-batch loss shown in the output
            err, _, step, emb, lab = sess.run([loss, train_op, global_step, embeddings, labels_batch], feed_dict=feed_dict)
            emb_array[lab,:] = emb
            loss_array[i] = err
            duration = time.time() - start_time
            print('Epoch: [%d][%d/%d]\tTime %.3f\tLoss %2.3f' %
                  (epoch, batch_number+1, args.epoch_size, duration, err))
            batch_number += 1
            i += 1
            train_time += duration
            summary.value.add(tag='loss', simple_value=err)
            
        # Add validation loss and accuracy to summary
        #pylint: disable=maybe-no-member
        summary.value.add(tag='time/selection', simple_value=selection_time)
        summary_writer.add_summary(summary, step)
    return step

Therefore, the loss here is not computed over the batch_size defined in the parameters; it is computed over the configured number of people and images per person, and then only over the triplets that survive the sample screening. Let's take a look at the implementation of that screening, select_triplets.

def select_triplets(embeddings, nrof_images_per_class, image_paths, people_per_batch, alpha):
    """ Select the triplets for training
    """
    trip_idx = 0
    emb_start_idx = 0
    num_trips = 0
    triplets = []
    
    # VGG Face: Choosing good triplets is crucial and should strike a balance between
    #  selecting informative (i.e. challenging) examples and swamping training with examples that
    #  are too hard. This is achieved by extending each pair (a, p) to a triplet (a, p, n) by sampling
    #  the image n at random, but only between the ones that violate the triplet loss margin. The
    #  latter is a form of hard-negative mining, but it is not as aggressive (and much cheaper) than
    #  choosing the maximally violating example, as often done in structured output learning.
    # Iterate over the people in the batch
    for i in xrange(people_per_batch):
        nrof_images = int(nrof_images_per_class[i])
        # Traverse each person's images
        for j in xrange(1,nrof_images):
            a_idx = emb_start_idx + j - 1
            neg_dists_sqr = np.sum(np.square(embeddings[a_idx] - embeddings), 1)
            # Construct sample pair
            for pair in xrange(j, nrof_images): # For every possible positive pair.
                p_idx = emb_start_idx + pair
                # Construct positive sample pairs for different pictures of the same person
                pos_dist_sqr = np.sum(np.square(embeddings[a_idx]-embeddings[p_idx]))
                # Mask out the distances to the same person's images so they cannot be picked as negatives
                neg_dists_sqr[emb_start_idx:emb_start_idx+nrof_images] = np.NaN
                #all_neg = np.where(np.logical_and(neg_dists_sqr-pos_dist_sqr<alpha, pos_dist_sqr<neg_dists_sqr))[0]  # FaceNet selection
                # Screen the negatives: keep those whose squared distance to the anchor exceeds the
                # positive distance by less than alpha, i.e. the triplets that still violate the triplet-loss margin.
                # Because negatives vastly outnumber positives, only these not-yet-solved samples are kept;
                # negatives that already satisfy the margin are effectively classified correctly and are
                # discarded. Hence the < here, which is exactly the opposite of the condition in the loss formula.
                all_neg = np.where(neg_dists_sqr-pos_dist_sqr < alpha)[0] # VGG Face selection
                # From the qualifying negatives, one is picked at random to form the triplet (a, p, n)
                nrof_random_negs = all_neg.shape[0]
                if nrof_random_negs > 0:
                    rnd_idx = np.random.randint(nrof_random_negs)
                    n_idx = all_neg[rnd_idx]
                    triplets.append((image_paths[a_idx], image_paths[p_idx], image_paths[n_idx]))
                    #print('Triplet %d: (%d, %d, %d), pos_dist=%2.6f, neg_dist=%2.6f (%d, %d, %d, %d, %d)' % 
                    #    (trip_idx, a_idx, p_idx, n_idx, pos_dist_sqr, neg_dists_sqr[n_idx], nrof_random_negs, rnd_idx, i, j, emb_start_idx))
                    trip_idx += 1

                num_trips += 1

        emb_start_idx += nrof_images
    # Finally, shuffle the triplets
    np.random.shuffle(triplets)
    return triplets, num_trips, len(triplets)

The screening of negatives here is still quite simple and crude. If you want to optimize the network, you can try modifying the negative-selection strategy in the following place:

# Construct sample pair
for pair in xrange(j, nrof_images): # For every possible positive pair.
    p_idx = emb_start_idx + pair
    # Construct positive sample pairs for different pictures of the same person
    pos_dist_sqr = np.sum(np.square(embeddings[a_idx]-embeddings[p_idx]))
    # Mask out the distances to the same person's images so they cannot be picked as negatives
    neg_dists_sqr[emb_start_idx:emb_start_idx+nrof_images] = np.NaN
    #all_neg = np.where(np.logical_and(neg_dists_sqr-pos_dist_sqr<alpha, pos_dist_sqr<neg_dists_sqr))[0]  # FaceNet selection
    # Screen the negatives: keep those whose squared distance to the anchor exceeds the
    # positive distance by less than alpha, i.e. the triplets that still violate the triplet-loss margin.
    # Because negatives vastly outnumber positives, only these not-yet-solved samples are kept;
    # negatives that already satisfy the margin are effectively classified correctly and are
    # discarded. Hence the < here, which is exactly the opposite of the condition in the loss formula.
    all_neg = np.where(neg_dists_sqr-pos_dist_sqr < alpha)[0] # VGG Face selection

The positive samples are not screened at all here; every positive pair is used. You could add positive-sample screening as well to further optimize the network. To sum up, there are three places where the whole FaceNet pipeline can be improved: first, better screening strategies for negative and positive samples; second, data augmentation, adding richer augmentation strategies so the model becomes more robust; and third, the backbone network, where simply swapping in an off-the-shelf architecture such as SENet or DenseNet can extract better features and further improve robustness.
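As one illustration of what such a change could look like, the sketch below simply re-enables the stricter FaceNet-style condition that is commented out in the code above, keeping only semi-hard negatives (farther from the anchor than the positive, but still inside the margin). This is a modification you could try, not the behaviour of the shipped code:

# Semi-hard negative selection (the commented-out "FaceNet selection" above):
# the negative must already be farther than the positive, yet still violate the margin.
all_neg = np.where(np.logical_and(neg_dists_sqr - pos_dist_sqr < alpha,
                                  pos_dist_sqr < neg_dists_sqr))[0]
# In the same spot, positives could also be screened, e.g. by keeping only the
# hardest positive (largest pos_dist_sqr) per anchor instead of every (a, p) pair.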

FaceNet model test

There is a validate_on_lfw.py script in the facenet/src directory that can help us test the model. Let's take a look at the code in it. Here we again need to change

import tensorflow as tf

into

import tensorflow.compat.v1 as tf
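Depending on your TensorFlow 2.x version, this graph/session-style code may also need eager execution switched off; if you see errors about placeholders or sessions, adding the following (an assumption about your environment, not part of the original script) usually helps:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # run the TF1-style placeholder/session code under TF2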

Let's start with the main() method

def main(args):
  
    with tf.Graph().as_default():
      
        with tf.Session() as sess:
            
            # Read the test image pairs
            pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))

            # Get the test image paths and whether each pair shows the same person
            paths, actual_issame = lfw.get_paths(os.path.expanduser(args.lfw_dir), pairs)
            # Image path placeholder
            image_paths_placeholder = tf.placeholder(tf.string, shape=(None, 1), name='image_paths')
            # Label placeholder; these hold image indices so the embeddings can be put back in order after dequeuing
            labels_placeholder = tf.placeholder(tf.int32, shape=(None, 1), name='labels')
            batch_size_placeholder = tf.placeholder(tf.int32, name='batch_size')
            control_placeholder = tf.placeholder(tf.int32, shape=(None, 1), name='control')
            phase_train_placeholder = tf.placeholder(tf.bool, name='phase_train')
 
            nrof_preprocess_threads = 4
            image_size = (args.image_size, args.image_size)
            # Create a first-in-first-out queue; images are read through this data queue
            eval_input_queue = data_flow_ops.FIFOQueue(capacity=2000000,
                                        dtypes=[tf.string, tf.int32, tf.int32],
                                        shapes=[(1,), (1,), (1,)],
                                        shared_name=None, name=None)
            eval_enqueue_op = eval_input_queue.enqueue_many([image_paths_placeholder, labels_placeholder, control_placeholder], name='eval_enqueue_op')
            image_batch, label_batch = facenet.create_input_pipeline(eval_input_queue, image_size, nrof_preprocess_threads, batch_size_placeholder)
     
            # Loading model
            input_map = {'image_batch': image_batch, 'label_batch': label_batch, 'phase_train': phase_train_placeholder}
            facenet.load_model(args.model, input_map=input_map)

            # The embedding matrix is obtained by the forward pass
            embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
            coord = tf.train.Coordinator()
            tf.train.start_queue_runners(coord=coord, sess=sess)
            # The distance metric is args.distance_metric; with it, the similarity between embeddings is measured.
            # A pair is judged by comparing the distance between its two embeddings against a threshold:
            # if the two images come from the same person and their distance is within the threshold,
            # the prediction counts as correct; if the two images come from different people and their
            # distance exceeds the threshold, the model correctly predicts that they are not the same
            # person, which also counts as correct. Everything else is a prediction error.
            evaluate(sess, eval_enqueue_op, image_paths_placeholder, labels_placeholder, phase_train_placeholder, batch_size_placeholder, control_placeholder,
                embeddings, label_batch, paths, actual_issame, args.lfw_batch_size, args.lfw_nrof_folds, args.distance_metric, args.subtract_mean,
                args.use_flipped_images, args.use_fixed_image_standardization)
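To make the comment above concrete: verification on LFW boils down to thresholding the distance between the two embeddings of each pair. The actual implementation sweeps thresholds across folds to build a ROC curve, but a minimal sketch of the core idea looks like this (emb1, emb2, threshold are illustrative names, not taken from the script):

import numpy as np

def verify_pairs(emb1, emb2, actual_issame, threshold=1.1):
    # emb1, emb2: embeddings of the two images in each pair, shape (n, embedding_size)
    # actual_issame: boolean array, True if the pair shows the same person
    # threshold: illustrative value; the real script evaluates many thresholds
    dist = np.sum(np.square(emb1 - emb2), axis=1)   # squared Euclidean distance per pair
    predict_issame = dist < threshold               # close enough -> predicted same person
    accuracy = np.mean(predict_issame == np.asarray(actual_issame))
    return accuracy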

Let's take a look at the model we trained previously

cd /Users/admin/models/facenet/20211211-082127/
ls

We can see

checkpoint
model-20211211-082127.ckpt-1007.data-00000-of-00001
model-20211211-082127.ckpt-1007.index
model-20211211-082127.ckpt-2028.data-00000-of-00001
model-20211211-082127.ckpt-2028.index
model-20211211-082127.ckpt-3040.data-00000-of-00001
model-20211211-082127.ckpt-3040.index
model-20211211-082127.meta

Now let's start the test: enter the facenet folder and run

python src/validate_on_lfw.py /Users/admin/Downloads/lfw_160 /Users/admin/models/facenet/20211211-082127

Here the first argument is the image directory and the second is the model directory. The test log is as follows:

Runnning forward pass on LFW images
............
Accuracy: 0.85600+-0.01982
Validation rate: 0.15433+-0.03649 @ FAR=0.00100
Area Under Curve (AUC): 0.933
Equal Error Rate (EER): 0.147
2021-12-13 08:15:01.242145: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242201: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242231: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242257: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242288: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242315: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242344: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242375: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed

Since I used the same data for training and testing here, which really should not be done (but training is time-consuming, so let's overlook it for now), we can see that the test Accuracy is about 0.85 and the AUC is 0.93. Of course, if different image datasets were used for training and testing, the accuracy would not be this high. There is a field dedicated to this kind of cross-domain problem, open-set domain transfer learning, which can improve model accuracy in such settings.

Freezing the trained model into a pb file

There is a freeze_graph.py script in the facenet/src directory; here we again need to change

import tensorflow as tf

into

import tensorflow.compat.v1 as tf

Let's also look at the main() method

def main(args):
    with tf.Graph().as_default():
        with tf.Session() as sess:
            # Load the model metagraph and checkpoint
            print('Model directory: %s' % args.model_dir)
            # Get the trained model file
            meta_file, ckpt_file = facenet.get_model_filenames(os.path.expanduser(args.model_dir))
            
            print('Metagraph file: %s' % meta_file)
            print('Checkpoint file: %s' % ckpt_file)

            model_dir_exp = os.path.expanduser(args.model_dir)
            saver = tf.train.import_meta_graph(os.path.join(model_dir_exp, meta_file), clear_devices=True)
            tf.get_default_session().run(tf.global_variables_initializer())
            tf.get_default_session().run(tf.local_variables_initializer())
            # Restore model to current session
            saver.restore(tf.get_default_session(), os.path.join(model_dir_exp, ckpt_file))
            
            # Retrieve the protobuf graph definition and fix the batch norm nodes
            input_graph_def = sess.graph.as_graph_def()
            
            # Freeze the graph of the current session, keeping embeddings and label_batch as outputs
            output_graph_def = freeze_graph_def(sess, input_graph_def, 'embeddings,label_batch')

        # Serialize and dump the output graph to the filesystem
        with tf.gfile.GFile(args.output_file, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph: %s" % (len(output_graph_def.node), args.output_file))

In the freeze_graph_def method, the following code does the main work of saving the model

# Get the list of nodes to save
whitelist_names = []
for node in input_graph_def.node:
    if (node.name.startswith('InceptionResnet') or node.name.startswith('embeddings') or 
            node.name.startswith('image_batch') or node.name.startswith('label_batch') or
            node.name.startswith('phase_train') or node.name.startswith('Logits')):
        whitelist_names.append(node.name)

# Converting variables into constants is the actual freezing step
output_graph_def = graph_util.convert_variables_to_constants(
    sess, input_graph_def, output_node_names.split(","),
    variable_names_whitelist=whitelist_names)

Now we also enter the facenet folder and execute

python src/freeze_graph.py /Users/admin/models/facenet/20211211-082127 /Users/admin/models/facenet/20211211-082127/graph.pb

At this point, if we enter the /Users/admin/models/facenet/20211211-082127 folder, we can see graph.pb, the frozen model.
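Freezing bakes all trained variables into constants inside a single GraphDef, so graph.pb can be shipped and loaded without any checkpoint files. A minimal sketch of loading it back and computing embeddings might look like the following; the tensor names image_batch:0, phase_train:0 and embeddings:0 are assumptions based on the whitelist above and may differ in other versions (some frozen FaceNet models use input:0), and the dummy images array stands in for properly preprocessed 160x160 face crops:

import numpy as np
import tensorflow.compat.v1 as tf

with tf.Graph().as_default(), tf.Session() as sess:
    # Read the serialized GraphDef and import it into the default graph
    with tf.gfile.GFile('/Users/admin/models/facenet/20211211-082127/graph.pb', 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def, name='')

    images_placeholder = tf.get_default_graph().get_tensor_by_name('image_batch:0')
    phase_train = tf.get_default_graph().get_tensor_by_name('phase_train:0')
    embeddings = tf.get_default_graph().get_tensor_by_name('embeddings:0')

    # Dummy batch; in practice this would be preprocessed (whitened) face crops
    images = np.zeros((1, 160, 160, 3), dtype=np.float32)
    emb = sess.run(embeddings, feed_dict={images_placeholder: images, phase_train: False})
    print(emb.shape)  # (1, embedding_size)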
