Image Operations in TensorFlow (II)
Here we focus on the train method. A very important point in the training phase is how samples are selected. If we train our network with triplet loss directly, we run into a serious problem: the numbers of positive and negative sample pairs are highly imbalanced. This is where hard-example mining comes in. Strictly speaking, the strategy used in FaceNet cannot be called OHEM or hard-example mining, but it shares the same core idea. If you want to train the model better, the sample-selection part here is the place to optimize.
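For reference, the triplet loss that drives this training pushes an anchor closer to a positive (same person) than to a negative (different person) by a margin alpha: L = max(||f(a) - f(p)||^2 - ||f(a) - f(n)||^2 + alpha, 0). The following is only a minimal NumPy sketch of that formula, not the facenet implementation:

import numpy as np

def triplet_loss_sketch(anchor, positive, negative, alpha=0.2):
    """Minimal sketch of the triplet loss.

    anchor, positive, negative: arrays of shape (batch, embedding_size)
    alpha: margin between positive and negative distances
    """
    pos_dist = np.sum(np.square(anchor - positive), axis=1)  # ||f(a) - f(p)||^2
    neg_dist = np.sum(np.square(anchor - negative), axis=1)  # ||f(a) - f(n)||^2
    # Only triplets that still violate the margin contribute to the loss
    return np.mean(np.maximum(pos_dist - neg_dist + alpha, 0.0))

With that in mind, here is the train method: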
def train(args, sess, dataset, epoch, image_paths_placeholder, labels_placeholder, labels_batch,
          batch_size_placeholder, learning_rate_placeholder, phase_train_placeholder, enqueue_op,
          input_queue, global_step, embeddings, loss, train_op, summary_op, summary_writer,
          learning_rate_schedule_file, embedding_size, anchor, positive, negative, triplet_loss):
    batch_number = 0
    # Get the learning rate
    if args.learning_rate > 0.0:
        lr = args.learning_rate
    else:
        lr = facenet.get_learning_rate_from_file(learning_rate_schedule_file, epoch)
    # epoch_size is the number of batch_size-sized batches per epoch.
    # For example, if the total sample size is 100000 and batch_size is 100,
    # then epoch_size = 100000 / 100 = 1000
    while batch_number < args.epoch_size:
        # Randomly sample from the data source.
        # The number of samples drawn each time is determined by args.people_per_batch and args.images_per_person:
        # args.people_per_batch is how many people are in the current group,
        # args.images_per_person is how many pictures each person contributes
        image_paths, num_per_class = sample_people(dataset, args.people_per_batch, args.images_per_person)

        print('Running forward pass on sampled images: ', end='')
        start_time = time.time()
        # Total number of sampled images
        nrof_examples = args.people_per_batch * args.images_per_person
        labels_array = np.reshape(np.arange(nrof_examples), (-1, 3))
        image_paths_array = np.reshape(np.expand_dims(np.array(image_paths), 1), (-1, 3))
        sess.run(enqueue_op, {image_paths_placeholder: image_paths_array, labels_placeholder: labels_array})
        emb_array = np.zeros((nrof_examples, embedding_size))
        # Number of forward-pass batches
        nrof_batches = int(np.ceil(nrof_examples / args.batch_size))
        # Run the forward pass batch by batch to obtain the embeddings and labels of the sampled images
        for i in range(nrof_batches):
            batch_size = min(nrof_examples - i*args.batch_size, args.batch_size)
            emb, lab = sess.run([embeddings, labels_batch], feed_dict={batch_size_placeholder: batch_size,
                learning_rate_placeholder: lr, phase_train_placeholder: True})
            emb_array[lab, :] = emb
        print('%.3f' % (time.time()-start_time))

        # The loss is not computed right away from these embeddings and labels; select_triplets
        # first picks the triplets that will actually contribute to the loss
        print('Selecting suitable triplets for training')
        triplets, nrof_random_negs, nrof_triplets = select_triplets(emb_array, num_per_class,
            image_paths, args.people_per_batch, args.alpha)
        selection_time = time.time() - start_time
        print('(nrof_random_negs, nrof_triplets) = (%d, %d): time=%.3f seconds' %
            (nrof_random_negs, nrof_triplets, selection_time))

        # Recompute the number of batches for the selected triplets
        nrof_batches = int(np.ceil(nrof_triplets*3/args.batch_size))
        triplet_paths = list(itertools.chain(*triplets))
        labels_array = np.reshape(np.arange(len(triplet_paths)), (-1, 3))
        triplet_paths_array = np.reshape(np.expand_dims(np.array(triplet_paths), 1), (-1, 3))
        sess.run(enqueue_op, {image_paths_placeholder: triplet_paths_array, labels_placeholder: labels_array})
        nrof_examples = len(triplet_paths)
        train_time = 0
        i = 0
        emb_array = np.zeros((nrof_examples, embedding_size))
        loss_array = np.zeros((nrof_triplets,))
        summary = tf.Summary()
        step = 0
        # Loop over the selected triplets and run the training op
        while i < nrof_batches:
            start_time = time.time()
            batch_size = min(nrof_examples - i*args.batch_size, args.batch_size)
            feed_dict = {batch_size_placeholder: batch_size, learning_rate_placeholder: lr, phase_train_placeholder: True}
            # err is the loss of this batch_size-sized batch in the output
            err, _, step, emb, lab = sess.run([loss, train_op, global_step, embeddings, labels_batch], feed_dict=feed_dict)
            emb_array[lab, :] = emb
            loss_array[i] = err
            duration = time.time() - start_time
            print('Epoch: [%d][%d/%d]\tTime %.3f\tLoss %2.3f' %
                (epoch, batch_number+1, args.epoch_size, duration, err))
            batch_number += 1
            i += 1
            train_time += duration
            summary.value.add(tag='loss', simple_value=err)

        # Add validation loss and accuracy to summary
        #pylint: disable=maybe-no-member
        summary.value.add(tag='time/selection', simple_value=selection_time)
        summary_writer.add_summary(summary, step)
    return step
Therefore, the loss here is not computed over the batch_size samples defined in the arguments. Instead, each sampling pass draws people_per_batch people with images_per_person pictures each, and the loss is computed only over the triplets that survive the sample screening.
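For example (hypothetical values), if people_per_batch is 45 and images_per_person is 40, each sampling pass embeds 45 x 40 = 1800 images; with a batch_size of 90 the forward pass runs ceil(1800 / 90) = 20 batches, and only the triplets selected from those 1800 embeddings are enqueued again for the actual training step. Let's now look at the implementation of that screening step, select_triplets.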
def select_triplets(embeddings, nrof_images_per_class, image_paths, people_per_batch, alpha):
    """ Select the triplets for training
    """
    trip_idx = 0
    emb_start_idx = 0
    num_trips = 0
    triplets = []

    # VGG Face: Choosing good triplets is crucial and should strike a balance between
    # selecting informative (i.e. challenging) examples and swamping training with examples that
    # are too hard. This is achieved by extending each pair (a, p) to a triplet (a, p, n) by sampling
    # the image n at random, but only between the ones that violate the triplet loss margin. The
    # latter is a form of hard-negative mining, but it is not as aggressive (and much cheaper) than
    # choosing the maximally violating example, as often done in structured output learning.

    # Iterate over the people in this batch
    for i in xrange(people_per_batch):
        nrof_images = int(nrof_images_per_class[i])
        # Iterate over the images of each person
        for j in xrange(1, nrof_images):
            a_idx = emb_start_idx + j - 1
            neg_dists_sqr = np.sum(np.square(embeddings[a_idx] - embeddings), 1)
            # Construct sample pairs
            for pair in xrange(j, nrof_images): # For every possible positive pair.
                p_idx = emb_start_idx + pair
                # Positive pairs are built from different pictures of the same person
                pos_dist_sqr = np.sum(np.square(embeddings[a_idx]-embeddings[p_idx]))
                # Negative pairs are built from pictures of different people
                neg_dists_sqr[emb_start_idx:emb_start_idx+nrof_images] = np.NaN
                #all_neg = np.where(np.logical_and(neg_dists_sqr-pos_dist_sqr<alpha, pos_dist_sqr<neg_dists_sqr))[0]  # FaceNet selection
                # Screen the negatives: keep only those whose distance to the anchor exceeds the positive
                # distance by less than alpha, i.e. the triplets that still violate the triplet loss margin.
                # Because there are far more negatives than positives, the negatives that already satisfy the
                # constraint are correctly separated, contribute nothing, and are discarded; we keep only the
                # triplets that do not yet satisfy the constraint, hence the <, which is exactly the opposite
                # of the condition for zero loss.
                all_neg = np.where(neg_dists_sqr-pos_dist_sqr < alpha)[0]  # VGG Face selection

                # Pick a random negative among the candidates to form the final triplet
                nrof_random_negs = all_neg.shape[0]
                if nrof_random_negs > 0:
                    rnd_idx = np.random.randint(nrof_random_negs)
                    n_idx = all_neg[rnd_idx]
                    triplets.append((image_paths[a_idx], image_paths[p_idx], image_paths[n_idx]))
                    #print('Triplet %d: (%d, %d, %d), pos_dist=%2.6f, neg_dist=%2.6f (%d, %d, %d, %d, %d)' %
                    #    (trip_idx, a_idx, p_idx, n_idx, pos_dist_sqr, neg_dists_sqr[n_idx], nrof_random_negs, rnd_idx, i, j, emb_start_idx))
                    trip_idx += 1

                num_trips += 1

        emb_start_idx += nrof_images

    # Shuffle the selected triplets
    np.random.shuffle(triplets)
    return triplets, num_trips, len(triplets)
The screening of negative samples here is still rather crude. If you want to optimize the network, the place to change the negative-sample selection strategy is the following snippet:
# Construct sample pairs
for pair in xrange(j, nrof_images): # For every possible positive pair.
    p_idx = emb_start_idx + pair
    # Positive pairs are built from different pictures of the same person
    pos_dist_sqr = np.sum(np.square(embeddings[a_idx]-embeddings[p_idx]))
    # Negative pairs are built from pictures of different people
    neg_dists_sqr[emb_start_idx:emb_start_idx+nrof_images] = np.NaN
    #all_neg = np.where(np.logical_and(neg_dists_sqr-pos_dist_sqr<alpha, pos_dist_sqr<neg_dists_sqr))[0]  # FaceNet selection
    # Keep only the negatives that still violate the triplet loss margin
    all_neg = np.where(neg_dists_sqr-pos_dist_sqr < alpha)[0]  # VGG Face selection
The positive samples are not screened at all here; every positive pair is kept. You could also add screening of positive pairs to further optimize the network. To sum up, the whole FaceNet pipeline can be optimized at three points: first, better screening strategies for negative and positive samples; second, data augmentation, i.e. richer augmentation strategies to make the model more robust; third, the backbone network, where you only need to swap in a ready-made architecture such as SENet or DenseNet to extract better features and improve robustness. A minimal sketch of the first point is shown below.
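The following sketch shows two possible tweaks inside the pair loop of select_triplets (it assumes the variable names from the snippet above; min_pos_dist is a hypothetical tunable threshold, not part of the original code):

# 1) Optional positive-pair screening: skip pairs that are already very close,
#    since they contribute almost nothing to the loss.
min_pos_dist = 0.1
if pos_dist_sqr < min_pos_dist:
    continue

# 2) Semi-hard negative selection (the commented-out "FaceNet selection" line): keep only
#    negatives that are farther from the anchor than the positive but still within the margin.
all_neg = np.where(np.logical_and(neg_dists_sqr - pos_dist_sqr < alpha,
                                  pos_dist_sqr < neg_dists_sqr))[0]

Whether these choices help depends on the dataset; they are only starting points for experimentation.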
Testing the FaceNet model
There is a validate_on_lfw.py script in the facenet/src directory that can help us test the model. Let's take a look at its code. Here, too, we need to change
import tensorflow as tf
to
import tensorflow.compat.v1 as tf
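Depending on the TensorFlow 2.x version, the compat import alone may not be enough for this graph-and-session style code; a common companion call (an assumption, adjust to your environment) is:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # run the TF1-style graph/session code under TF2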
Start with the main() method
def main(args):
    with tf.Graph().as_default():
        with tf.Session() as sess:
            # Read the test pairs
            pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
            # Get the paths of the test images
            paths, actual_issame = lfw.get_paths(os.path.expanduser(args.lfw_dir), pairs)

            # Image paths
            image_paths_placeholder = tf.placeholder(tf.string, shape=(None, 1), name='image_paths')
            # Image labels, i.e. whether two images come from the same person or not
            labels_placeholder = tf.placeholder(tf.int32, shape=(None, 1), name='labels')
            batch_size_placeholder = tf.placeholder(tf.int32, name='batch_size')
            control_placeholder = tf.placeholder(tf.int32, shape=(None, 1), name='control')
            phase_train_placeholder = tf.placeholder(tf.bool, name='phase_train')

            nrof_preprocess_threads = 4
            image_size = (args.image_size, args.image_size)
            # Create a first-in-first-out data queue and read the data through it
            eval_input_queue = data_flow_ops.FIFOQueue(capacity=2000000,
                dtypes=[tf.string, tf.int32, tf.int32],
                shapes=[(1,), (1,), (1,)],
                shared_name=None, name=None)
            eval_enqueue_op = eval_input_queue.enqueue_many([image_paths_placeholder, labels_placeholder, control_placeholder], name='eval_enqueue_op')
            image_batch, label_batch = facenet.create_input_pipeline(eval_input_queue, image_size, nrof_preprocess_threads, batch_size_placeholder)

            # Load the model
            input_map = {'image_batch': image_batch, 'label_batch': label_batch, 'phase_train': phase_train_placeholder}
            facenet.load_model(args.model, input_map=input_map)

            # Get the embedding matrix produced by the forward pass
            embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")

            coord = tf.train.Coordinator()
            tf.train.start_queue_runners(coord=coord, sess=sess)
            # The distance metric is args.distance_metric; it measures the similarity between embeddings.
            # Each pair is judged by comparing its distance with a threshold: if two images of the same person
            # are closer than the threshold, the pair is predicted as "same" and the prediction is correct;
            # if two images of different people are farther apart than the threshold, the pair is predicted
            # as "different", which is also correct. Everything else counts as a prediction error.
            evaluate(sess, eval_enqueue_op, image_paths_placeholder, labels_placeholder, phase_train_placeholder, batch_size_placeholder, control_placeholder,
                embeddings, label_batch, paths, actual_issame, args.lfw_batch_size, args.lfw_nrof_folds, args.distance_metric, args.subtract_mean,
                args.use_flipped_images, args.use_fixed_image_standardization)
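To make the thresholding idea in those comments concrete, here is a minimal NumPy sketch (hypothetical, not the lfw evaluation code) of how a distance threshold turns embedding pairs into same/different predictions and an accuracy value:

import numpy as np

def pair_accuracy_sketch(emb1, emb2, actual_issame, threshold=1.1):
    """Judge pairs by the Euclidean distance between their embeddings.

    emb1, emb2: arrays of shape (num_pairs, embedding_size)
    actual_issame: boolean array, True if the pair shows the same person
    threshold: distance below which a pair is predicted as "same" (tunable)
    """
    dist = np.sqrt(np.sum(np.square(emb1 - emb2), axis=1))
    predict_issame = dist < threshold
    # Accuracy is the fraction of pairs whose prediction matches the ground truth
    return np.mean(predict_issame == np.asarray(actual_issame))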
Let's take a look at the model we trained earlier:
cd /Users/admin/models/facenet/20211211-082127/
ls
We can see
checkpoint
model-20211211-082127.ckpt-1007.data-00000-of-00001
model-20211211-082127.ckpt-1007.index
model-20211211-082127.ckpt-2028.data-00000-of-00001
model-20211211-082127.ckpt-2028.index
model-20211211-082127.ckpt-3040.data-00000-of-00001
model-20211211-082127.ckpt-3040.index
model-20211211-082127.meta
Now let's start the test. Enter the facenet folder and run
python src/validate_on_lfw.py /Users/admin/Downloads/lfw_160 /Users/admin/models/facenet/20211211-082127
Here, the first parameter is the image path and the second is the model path. The test log is as follows:
Runnning forward pass on LFW images
............
Accuracy: 0.85600+-0.01982
Validation rate: 0.15433+-0.03649 @ FAR=0.00100
Area Under Curve (AUC): 0.933
Equal Error Rate (EER): 0.147
2021-12-13 08:15:01.242145: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242201: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242231: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242257: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242288: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242315: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242344: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
2021-12-13 08:15:01.242375: W tensorflow/core/kernels/queue_base.cc:285] _0_FIFOQueueV2: Skipping cancelled dequeue attempt with queue not closed
Note that I used the same data for both training and testing here, which really should not be done, but training is time-consuming, so let's set that aside for now. We can see that the test Accuracy is 0.85 and the AUC is 0.93. Of course, if separate image datasets were used for training and testing, the model accuracy would not be this high. There is a field dedicated to such cross-domain problems, open-set domain transfer learning, which can improve model accuracy in this setting.
Freezing the trained model into a pb file
There is a freeze_graph.py script in the facenet/src directory. Here, too, we need to change
import tensorflow as tf
to
import tensorflow.compat.v1 as tf
Let's also look at the main() method
def main(args):
    with tf.Graph().as_default():
        with tf.Session() as sess:
            # Load the model metagraph and checkpoint
            print('Model directory: %s' % args.model_dir)
            # Get the trained model files
            meta_file, ckpt_file = facenet.get_model_filenames(os.path.expanduser(args.model_dir))

            print('Metagraph file: %s' % meta_file)
            print('Checkpoint file: %s' % ckpt_file)

            model_dir_exp = os.path.expanduser(args.model_dir)
            saver = tf.train.import_meta_graph(os.path.join(model_dir_exp, meta_file), clear_devices=True)
            tf.get_default_session().run(tf.global_variables_initializer())
            tf.get_default_session().run(tf.local_variables_initializer())
            # Restore the model into the current session
            saver.restore(tf.get_default_session(), os.path.join(model_dir_exp, ckpt_file))

            # Retrieve the protobuf graph definition and fix the batch norm nodes
            input_graph_def = sess.graph.as_graph_def()

            # Freeze the graph of the current session
            output_graph_def = freeze_graph_def(sess, input_graph_def, 'embeddings,label_batch')

        # Serialize and dump the output graph to the filesystem
        with tf.gfile.GFile(args.output_file, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph: %s" % (len(output_graph_def.node), args.output_file))
The following code in the freeze_graph_def method does the main work of saving the model:
    # Get the list of nodes to keep
    whitelist_names = []
    for node in input_graph_def.node:
        if (node.name.startswith('InceptionResnet') or node.name.startswith('embeddings') or
                node.name.startswith('image_batch') or node.name.startswith('label_batch') or
                node.name.startswith('phase_train') or node.name.startswith('Logits')):
            whitelist_names.append(node.name)

    # Converting the variables into constants is the actual "freezing" step
    output_graph_def = graph_util.convert_variables_to_constants(
        sess, input_graph_def, output_node_names.split(","),
        variable_names_whitelist=whitelist_names)
Now we also enter the facenet folder and execute
python src/freeze_graph.py /Users/admin/models/facenet/20211211-082127 /Users/admin/models/facenet/20211211-082127/graph.pb
At this point, if we enter the /Users/admin/models/facenet/20211211-082127 folder, we can see graph.pb, the frozen model.
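Once the frozen graph exists, it can be loaded back for inference without the checkpoint files. The following is only a minimal sketch: the tensor names image_batch, phase_train and embeddings are assumed from the freeze whitelist above, and the input batch is a dummy array rather than properly aligned, prewhitened face images.

import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

# Read the frozen graph definition
with tf.gfile.GFile('/Users/admin/models/facenet/20211211-082127/graph.pb', 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as graph:
    tf.import_graph_def(graph_def, name='')
    images = graph.get_tensor_by_name('image_batch:0')      # assumed input tensor name
    phase_train = graph.get_tensor_by_name('phase_train:0')
    embeddings = graph.get_tensor_by_name('embeddings:0')

    with tf.Session(graph=graph) as sess:
        # Dummy batch of two 160x160 RGB images just to show the feed/run pattern
        batch = np.zeros((2, 160, 160, 3), dtype=np.float32)
        emb = sess.run(embeddings, feed_dict={images: batch, phase_train: False})
        print(emb.shape)  # (2, embedding_size)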