Recreating the final layer of InceptionV4 for new categories: uninitialized local variable (2)

I am still a novice with tensorflow, so I apologize if this is a naive question. I am trying to use the inception_V4 model pre-trained on the ImageNet dataset published on this site. Moreover, I am using their network as it is, I mean the one published on their site.

This is how I call the network:

import tensorflow as tf
import tensorflow.contrib.slim as slim
from nets import inception   # TF-Slim model definitions (assumed import path)

def network(images_op, keep_prob):
    width_needed_InceptionV4Net = 342
    shape = images_op.get_shape().as_list()
    H = int(round(width_needed_InceptionV4Net * shape[1] / shape[2], 2))
    resized_images = tf.image.resize_images(images_op, [width_needed_InceptionV4Net, H],
                                            tf.image.ResizeMethod.BILINEAR)
    with slim.arg_scope(inception.inception_v4_arg_scope()):
        logits, _ = inception.inception_v4(resized_images, num_classes=20,
                                           is_training=True, dropout_keep_prob=keep_prob)
    return logits

Since I need to retrain the final layer of Inception_V4 for my own categories, I changed the number of classes to 20, as you can see in the call to inception.inception_v4.

This is the training method I have so far:

import os
import numpy as np

# _mask, _loss and _boosting_loss are helper functions defined elsewhere in the script

def optimistic_restore(session, save_file, flags):
    reader = tf.train.NewCheckpointReader(save_file)
    saved_shapes = reader.get_variable_to_shape_map()
    var_names = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                        if var.name.split(':')[0] in saved_shapes])
    restore_vars = []
    name2var = dict(zip(map(lambda x: x.name.split(':')[0], tf.global_variables()),
                        tf.global_variables()))
    if flags.checkpoint_exclude_scopes is not None:
        exclusions = [scope.strip() for scope in flags.checkpoint_exclude_scopes.split(',')]
    with tf.variable_scope('', reuse=True):
        variables_to_init = []
        for var_name, saved_var_name in var_names:
            curr_var = name2var[saved_var_name]
            var_shape = curr_var.get_shape().as_list()
            if var_shape == saved_shapes[saved_var_name]:
                print(saved_var_name)
                excluded = False
                for exclusion in exclusions:
                    if saved_var_name.startswith(exclusion):
                        variables_to_init.append(curr_var)
                        excluded = True
                        break
                if not excluded:
                    restore_vars.append(curr_var)
    saver = tf.train.Saver(restore_vars)
    saver.restore(session, save_file)


def train(images, ids, labels, total_num_examples, batch_size, train_dir,
          network, flags, optimizer, log_periods, resume):
    """!@brief Trains the network for a number of steps.

    @param images image tensor
    @param ids id tensor
    @param labels label tensor
    @param total_num_examples total number of training examples
    @param batch_size batch size
    @param train_dir directory where checkpoints should be saved
    @param network pointer to a function describing the network
    @param flags command-line arguments
    @param optimizer pointer to the optimization class
    @param log_periods list containing the step intervals at which 1) logs should be printed,
           2) logs should be saved for TensorBoard and 3) variables should be saved
    @param resume should training be resumed (or restarted from scratch)?
    @return the number of training steps performed since the first call to 'train'
    """
    # clearing the training directory
    if not resume:
        if tf.gfile.Exists(train_dir):
            tf.gfile.DeleteRecursively(train_dir)
        tf.gfile.MakeDirs(train_dir)
    print('Training the network in directory %s...' % train_dir)
    global_step = tf.Variable(0, trainable=False)

    # creating a placeholder, set to ones, used to assess the importance of each pixel
    mask, ones = _mask(images, batch_size, flags)

    # building a Graph that computes the logits predictions from the inference model
    keep_prob = tf.placeholder_with_default(0.5, [])
    logits = network(images * mask, keep_prob)

    # creating the optimizer
    if optimizer == tf.train.MomentumOptimizer:
        opt = optimizer(flags.learning_rate, flags.momentum)
    else:
        opt = optimizer(flags.learning_rate)

    # calculating the semantic loss, defined as the classification or regression loss
    if flags.boosting_weights is not None and os.path.isfile(flags.boosting_weights):
        boosting_weights_value = np.loadtxt(flags.boosting_weights, dtype=np.float32, delimiter=',')
        boosting_weights = tf.placeholder_with_default(boosting_weights_value,
                                                       list(boosting_weights_value.shape),
                                                       name='boosting_weights')
        semantic_loss = _boosting_loss(logits, ids, boosting_weights, flags)
    else:
        semantic_loss = _loss(logits, labels, flags)
    tf.add_to_collection('losses', semantic_loss)

    # computing the loss gradient with respect to the mask (i.e. the insight tensor) and
    # penalizing its L1-norm
    # replace 'semantic_loss' with 'tf.reduce_sum(logits)'?
    insight = tf.gradients(semantic_loss, [mask])[0]
    insight_loss = tf.reduce_sum(tf.abs(insight))
    if flags.insight_loss > 0.0:
        with tf.control_dependencies([semantic_loss]):
            tf.add_to_collection('losses',
                                 tf.multiply(flags.insight_loss, insight_loss, name='insight_loss'))
    else:
        tf.summary.scalar('insight_loss_raw', insight_loss)

    # summing all loss factors and computing the moving average of all individual losses
    # and of the sum
    loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
    loss_averages_op = tf.train.ExponentialMovingAverage(0.9, name='avg')
    losses = tf.get_collection('losses')
    loss_averages = loss_averages_op.apply(losses + [loss])

    # attaching a scalar summary to all individual losses and the total loss;
    # do the same for the averaged version of the losses
    for l in losses + [loss]:
        tf.summary.scalar(l.op.name + '_raw', l)
        tf.summary.scalar(l.op.name + '_avg', loss_averages_op.average(l))

    # computing and applying gradients
    with tf.control_dependencies([loss_averages]):
        grads = opt.compute_gradients(loss)
    apply_gradient = opt.apply_gradients(grads, global_step=global_step)

    # adding histograms for trainable variables and gradients
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)
    for grad, var in grads:
        if grad is not None:
            tf.summary.histogram(var.op.name + '/gradients', grad)
    tf.summary.histogram('insight', insight)

    # tracking the moving averages of all trainable variables
    variable_averages_op = tf.train.ExponentialMovingAverage(flags.moving_average_decay, global_step)
    variable_averages = variable_averages_op.apply(tf.trainable_variables())

    # building a Graph that trains the model with one batch of examples and
    # updates the model parameters
    with tf.control_dependencies([apply_gradient, variable_averages]):
        train_op = tf.no_op(name='train')

    # creating a saver
    saver = tf.train.Saver(tf.global_variables())

    # building the summary operation based on the TF collection of Summaries
    summary_op = tf.summary.merge_all()

    # creating a session
    current_global_step = -1
    with tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                          inter_op_parallelism_threads=flags.num_cpus,
                                          device_count={'GPU': flags.num_gpus})) as sess:
        # initializing variables
        if flags.checkpoint_exclude_scopes is not None:
            optimistic_restore(sess, os.path.join(train_dir, 'inception_V4.ckpt'), flags)
        # starting the queue runners ..
        # creating a summary writer ..
        # training itself ..
        # saving the model checkpoint
        checkpoint_path = os.path.join(train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=current_global_step)
        # stopping the queue runners ..
    return current_global_step

I added a flag to the Python script, called checkpoint_exclude_scopes, in which I specify which Tensors should not be restored. This is needed in order to change the number of classes in the last layer of the network. This is how I call the Python script:

./toolDetectionInceptions.py --batch_size=32 --running_mode=resume --checkpoint_exclude_scopes=InceptionV4/Logits,InceptionV4/AuxLogits
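For reference, the post does not show how the flags object is built; here is a minimal sketch of one way to declare the new option with plain argparse (the parser and option names are illustrative, not necessarily the author's actual ones):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--running_mode', default='resume')
parser.add_argument('--checkpoint_exclude_scopes', default=None,
                    help='comma-separated list of variable scopes to skip when restoring, '
                         'e.g. InceptionV4/Logits,InceptionV4/AuxLogits')
flags = parser.parse_args()
# optimistic_restore later splits flags.checkpoint_exclude_scopes on ','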

My first tests went badly because I ran into too many problems ... something like:

tensorflow.python.framework.errors.NotFoundError: Tensor name "InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/weights/read:0" not found in checkpoint files

After some googling, I was able to find a workaround on this site, where they propose using the optimistic_restore function shown in the code above, with a few modifications.

But now the problem is something else:

W tensorflow/core/framework/op_kernel.cc:993] Failed precondition: Attempting to use uninitialized value Variable [[Node: Variable/read = Identity[T=DT_INT32, _class=["loc:@Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](Variable)]]

It looks like there is a local variable that is not initialized, but I could not find it. Can you help?
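One way to track down such variables is to ask TensorFlow directly which variables are still uninitialized inside the session; a minimal sketch, assuming sess is the training session created above:

# returns the names (as byte strings) of every variable that has not been initialized yet
uninitialized = sess.run(tf.report_uninitialized_variables())
print(uninitialized)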

EDIT:

To troubleshoot this, I checked the number of variables that should be initialized and restored by adding some logging to the optimistic_restore function. Here is a summary:

# saved_shapes 609
# var_names 608
# name2var 1519
# variables_to_init: 7
# restore_vars: 596
# global_variables: 1519

For your information, CheckpointReader.get_variable_to_shape_map() returns a dict mapping tensor names to lists of ints, representing the shape of the corresponding tensor in the checkpoint. This means that the number of variables in this checkpoint is 609, while the total number of variables needed for restoration is 1519.
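For example, the checkpoint can be inspected on its own like this (the checkpoint path is an assumption):

reader = tf.train.NewCheckpointReader('/path/to/inception_V4.ckpt')
saved_shapes = reader.get_variable_to_shape_map()
print(len(saved_shapes))                 # 609 tensors stored in this checkpoint
for name, shape in sorted(saved_shapes.items())[:5]:
    print(name, shape)                   # tensor name and its shape as a list of ints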

There seems to be a large gap between the tensors stored in the pre-trained checkpoint and the variables used by the network architecture (which is actually their own network). Is there some kind of compression going on in the checkpoint? Is what I am saying accurate? I now know what was missing: it is simply the initialization of the variables that were not restored. However, I still need to know why there is such a large difference between their InceptionV4 network architecture and the pre-trained checkpoint.
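A small diagnostic sketch, assuming the training graph has already been built and save_file points to the checkpoint, to list the graph variables that have no counterpart in the checkpoint (i.e. where the gap comes from):

reader = tf.train.NewCheckpointReader(save_file)
saved_shapes = reader.get_variable_to_shape_map()
missing = [v.name for v in tf.global_variables()
           if v.name.split(':')[0] not in saved_shapes]
print(len(missing))              # variables that exist in the graph but not in the checkpoint
for name in sorted(missing):
    print(name)                  # typically optimizer slots, moving averages, global_step, etc.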


Here is how the optimistic_restore function should be defined so that it works as expected:

def optimistic_restore(session, save_file, flags):
    exclusions = []
    if flags.checkpoint_exclude_scopes is not None:
        exclusions = [scope.strip() for scope in flags.checkpoint_exclude_scopes.split(',')]
    reader = tf.train.NewCheckpointReader(save_file)
    saved_shapes = reader.get_variable_to_shape_map()
    print('saved_shapes %d' % len(saved_shapes))
    # variables whose name exists in the checkpoint
    var_names = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                        if var.name.split(':')[0] in saved_shapes])
    # variables that do not exist in the checkpoint and therefore must be initialized
    var_names_to_be_initialized = sorted([(var.name, var.name.split(':')[0])
                                          for var in tf.global_variables()
                                          if var.name.split(':')[0] not in saved_shapes])
    print('var_names %d' % len(var_names))
    print('var_names_to_be_initialized %d' % len(var_names_to_be_initialized))
    restore_vars = []
    name2var = dict(zip(map(lambda x: x.name.split(':')[0], tf.global_variables()),
                        tf.global_variables()))
    print('name2var %d' % len(name2var))
    with tf.variable_scope('', reuse=True):
        variables_to_init = []
        for var_name, saved_var_name in var_names:
            curr_var = name2var[saved_var_name]
            var_shape = curr_var.get_shape().as_list()
            if var_shape == saved_shapes[saved_var_name]:
                excluded = False
                for exclusion in exclusions:
                    if saved_var_name.startswith(exclusion):
                        variables_to_init.append(curr_var)
                        excluded = True
                        break
                if not excluded:
                    restore_vars.append(curr_var)
            else:
                variables_to_init.append(curr_var)
        for var_name, saved_var_name in var_names_to_be_initialized:
            curr_var = name2var[saved_var_name]
            variables_to_init.append(curr_var)
    print('variables_to_init: %d' % len(variables_to_init))
    print('global_variables: %d' % len(tf.global_variables()))
    print('restore_vars: %d' % len(restore_vars))
    # restoring only the variables actually present in the checkpoint ...
    saver = tf.train.Saver(restore_vars)
    saver.restore(session, save_file)
    # ... and explicitly initializing all the others
    session.run(tf.variables_initializer(variables_to_init))


The variables that are not restored by the saver must be initialized. To this end, you can run v.initializer.run() for each variable v that you do not restore.
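Equivalently, all the non-restored variables can be initialized in a single call; a minimal sketch, assuming restore_vars is the list that was passed to the Saver and sess is the session:

# initialize every global variable that the saver did not restore
uninitialized = [v for v in tf.global_variables() if v not in restore_vars]
sess.run(tf.variables_initializer(uninitialized))

# or, one variable at a time:
# for v in uninitialized:
#     v.initializer.run(session=sess)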