Recreating the final layer of InceptionV4 for new categories: uninitialized local variable
I'm still new to TensorFlow, so I'm sorry if this is a naive question. I'm trying to use the inception_V4 model, pre-trained on the ImageNet dataset, that is published on this site. I'm also using their network as-is, meaning the one published on their site.
This is how I call the network:
import tensorflow as tf
import tensorflow.contrib.slim as slim
from nets import inception  # inception_v4 from the TF-Slim models repository, assumed to be on the Python path

def network(images_op, keep_prob):
    width_needed_InceptionV4Net = 342
    shape = images_op.get_shape().as_list()
    H = int(round(width_needed_InceptionV4Net * shape[1] / shape[2], 2))
    resized_images = tf.image.resize_images(images_op, [width_needed_InceptionV4Net, H], tf.image.ResizeMethod.BILINEAR)
    with slim.arg_scope(inception.inception_v4_arg_scope()):
        logits, _ = inception.inception_v4(resized_images, num_classes=20, is_training=True, dropout_keep_prob=keep_prob)
    return logits
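For context, a minimal sketch of how this function might be fed; the placeholder shape here is an assumption for illustration, not taken from the original script:
# Hypothetical usage sketch: feed a batch of RGB images through network() defined above.
images_op = tf.placeholder(tf.float32, [None, 299, 299, 3], name='images')  # assumed input size
keep_prob = tf.placeholder_with_default(0.5, [])
logits = network(images_op, keep_prob)  # logits has shape [batch_size, 20]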
Since I need to retrain the final layer of Inception_V4 for my own categories, I changed the number of classes to 20, as you can see in the method call (inception.inception_v4).
This is the training code I have so far:
def optimistic_restore(session, save_file, flags):
    reader = tf.train.NewCheckpointReader(save_file)
    saved_shapes = reader.get_variable_to_shape_map()
    var_names = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                        if var.name.split(':')[0] in saved_shapes])
    restore_vars = []
    name2var = dict(zip(map(lambda x: x.name.split(':')[0], tf.global_variables()), tf.global_variables()))
    if flags.checkpoint_exclude_scopes is not None:
        exclusions = [scope.strip() for scope in flags.checkpoint_exclude_scopes.split(',')]
    with tf.variable_scope('', reuse=True):
        variables_to_init = []
        for var_name, saved_var_name in var_names:
            curr_var = name2var[saved_var_name]
            var_shape = curr_var.get_shape().as_list()
            if var_shape == saved_shapes[saved_var_name]:
                print(saved_var_name)
                excluded = False
                for exclusion in exclusions:
                    if saved_var_name.startswith(exclusion):
                        variables_to_init.append(curr_var)
                        excluded = True
                        break
                if not excluded:
                    restore_vars.append(curr_var)
    saver = tf.train.Saver(restore_vars)
    saver.restore(session, save_file)
def train(images, ids, labels, total_num_examples, batch_size, train_dir, network, flags,
          optimizer, log_periods, resume):
    """!@brief Trains the network for a number of steps.

    @param images image tensor
    @param ids id tensor
    @param labels label tensor
    @param total_num_examples total number of training examples
    @param batch_size batch size
    @param train_dir directory where checkpoints should be saved
    @param network pointer to a function describing the network
    @param flags command-line arguments
    @param optimizer pointer to the optimization class
    @param log_periods list containing the step intervals at which 1) logs should be printed,
           2) logs should be saved for TensorBoard and 3) variables should be saved
    @param resume should training be resumed (or restarted from scratch)?
    @return the number of training steps performed since the first call to 'train'
    """
    # clearing the training directory
    if not resume:
        if tf.gfile.Exists(train_dir):
            tf.gfile.DeleteRecursively(train_dir)
        tf.gfile.MakeDirs(train_dir)
    print('Training the network in directory %s...' % train_dir)
    global_step = tf.Variable(0, trainable=False)

    # creating a placeholder, set to ones, used to assess the importance of each pixel
    mask, ones = _mask(images, batch_size, flags)

    # building a Graph that computes the logits predictions from the inference model
    keep_prob = tf.placeholder_with_default(0.5, [])
    logits = network(images * mask, keep_prob)

    # creating the optimizer
    if optimizer == tf.train.MomentumOptimizer:
        opt = optimizer(flags.learning_rate, flags.momentum)
    else:
        opt = optimizer(flags.learning_rate)

    # calculating the semantic loss, defined as the classification or regression loss
    if flags.boosting_weights is not None and os.path.isfile(flags.boosting_weights):
        boosting_weights_value = np.loadtxt(flags.boosting_weights, dtype=np.float32,
                                            delimiter=',')
        boosting_weights = tf.placeholder_with_default(boosting_weights_value,
                                                       list(boosting_weights_value.shape),
                                                       name='boosting_weights')
        semantic_loss = _boosting_loss(logits, ids, boosting_weights, flags)
    else:
        semantic_loss = _loss(logits, labels, flags)
    tf.add_to_collection('losses', semantic_loss)

    # computing the loss gradient with respect to the mask (i.e. the insight tensor) and
    # penalizing its L1-norm
    # replace 'semantic_loss' with 'tf.reduce_sum(logits)'?
    insight = tf.gradients(semantic_loss, [mask])[0]
    insight_loss = tf.reduce_sum(tf.abs(insight))
    if flags.insight_loss > 0.0:
        with tf.control_dependencies([semantic_loss]):
            tf.add_to_collection('losses', tf.multiply(flags.insight_loss, insight_loss,
                                                       name='insight_loss'))
    else:
        tf.summary.scalar('insight_loss_raw', insight_loss)

    # summing all loss factors and computing the moving average of all individual losses and of
    # the sum
    loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
    loss_averages_op = tf.train.ExponentialMovingAverage(0.9, name='avg')
    losses = tf.get_collection('losses')
    loss_averages = loss_averages_op.apply(losses + [loss])

    # attaching a scalar summary to all individual losses and the total loss;
    # do the same for the averaged version of the losses
    for l in losses + [loss]:
        tf.summary.scalar(l.op.name + '_raw', l)
        tf.summary.scalar(l.op.name + '_avg', loss_averages_op.average(l))

    # computing and applying gradients
    with tf.control_dependencies([loss_averages]):
        grads = opt.compute_gradients(loss)
    apply_gradient = opt.apply_gradients(grads, global_step=global_step)

    # adding histograms for trainable variables and gradients
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)
    for grad, var in grads:
        if grad is not None:
            tf.summary.histogram(var.op.name + '/gradients', grad)
    tf.summary.histogram('insight', insight)

    # tracking the moving averages of all trainable variables
    variable_averages_op = tf.train.ExponentialMovingAverage(flags.moving_average_decay,
                                                             global_step)
    variable_averages = variable_averages_op.apply(tf.trainable_variables())

    # building a Graph that trains the model with one batch of examples and
    # updates the model parameters
    with tf.control_dependencies([apply_gradient, variable_averages]):
        train_op = tf.no_op(name='train')

    # creating a saver
    saver = tf.train.Saver(tf.global_variables())

    # building the summary operation based on the TF collection of Summaries
    summary_op = tf.summary.merge_all()

    # creating a session
    current_global_step = -1
    with tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                          inter_op_parallelism_threads=flags.num_cpus,
                                          device_count={'GPU': flags.num_gpus})) as sess:
        # initializing variables
        if flags.checkpoint_exclude_scopes is not None:
            optimistic_restore(sess, os.path.join(train_dir, 'inception_V4.ckpt'), flags)
        # starting the queue runners
        ..
        # creating a summary writer
        ..
        # training itself
        ..
        # saving the model checkpoint
        checkpoint_path = os.path.join(train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=current_global_step)
        # stopping the queue runners
        ..
    return current_global_step
I added a flag to the Python script called checkpoint_exclude_scopes, where I specify which tensors should not be restored. This is needed in order to change the number of classes in the last layer of the network. This is how I call the Python script:
./toolDetectionInceptions.py --batch_size=32 --running_mode=resume --checkpoint_exclude_scopes=InceptionV4/Logits,InceptionV4/AuxLogits
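For reference, a minimal sketch of how such flags could be declared with argparse; the flag names mirror the command line above, but the parsing code itself is an assumption, not taken from the original script:
# Hypothetical flag definitions matching the command line above (sketch only).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--running_mode', type=str, default='restart')
parser.add_argument('--checkpoint_exclude_scopes', type=str, default=None,
                    help='comma-separated list of variable scopes to skip when restoring')
flags = parser.parse_args()
# e.g. flags.checkpoint_exclude_scopes == 'InceptionV4/Logits,InceptionV4/AuxLogits'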
My first attempts went badly because I ran into too many problems... something like:
tensorflow.python.framework.errors.NotFoundError: Tensor name "InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/weights/read:0" not found in checkpoint files
After some googling I was able to find a workaround on this site, where they propose using the optimistic_restore function shown in the code above, with a few modifications.
But now the problem is something else:
W tensorflow/core/framework/op_kernel.cc:993] Failed precondition: Attempting to use uninitialized value Variable
[[Node: Variable/read = Identity[T=DT_INT32, _class=["loc:@Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](Variable)]]
It seems there is a local variable that is not initialized, but I could not find it. Can you help?
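For reference, one way to list which variables a session still considers uninitialized is tf.report_uninitialized_variables; a small diagnostic sketch, where sess stands for the training session above:
# Diagnostic sketch: print the names of variables TensorFlow considers uninitialized.
uninitialized_names = sess.run(tf.report_uninitialized_variables())
print(uninitialized_names)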
EDIT:
To troubleshoot this, I checked the number of variables that should be initialized and restored by adding some logging to the optimistic_restore function. Here is a short summary:
# saved_shapes 609
# var_names 608
# name2var 1519
# variables_to_init: 7
# restore_vars: 596
# global_variables: 1519
For your information, CheckpointReader.get_variable_to_shape_map() returns a dict mapping tensor names to lists of ints, representing the shape of the corresponding tensor in the checkpoint. This means the number of variables stored in this checkpoint is 609, while the total number of variables involved in the restore is 1519.
There seems to be a large gap between the tensors stored in the pre-trained checkpoint and the variables used by the network architecture (which is actually their own network, too). Is there some kind of compression in the checkpoint? Is what I am saying accurate? I now know what was missing: it is simply the initialization of the variables that were not restored. However, I still need to understand why there is such a big difference between their InceptionV4 network architecture and the pre-trained checkpoint.
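One way to inspect that gap is to compare the variable names in the checkpoint with the global variables of the built graph; a diagnostic sketch, where the checkpoint path is illustrative and the graph is assumed to have been constructed already:
# Diagnostic sketch: compare checkpoint tensor names with the graph's global variables.
reader = tf.train.NewCheckpointReader('inception_V4.ckpt')  # illustrative path
ckpt_names = set(reader.get_variable_to_shape_map().keys())
graph_names = set(v.name.split(':')[0] for v in tf.global_variables())

print('only in checkpoint: %d' % len(ckpt_names - graph_names))
print('only in graph: %d' % len(graph_names - ckpt_names))
# Variables that exist only in the graph typically include optimizer slots
# (e.g. .../Momentum) and the ExponentialMovingAverage shadow copies created in train(),
# which are not part of the published inference checkpoint.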
Here is how the optimistic_restore function should be defined so that it works as expected:
def optimistic_restore(session, save_file, flags):
    if flags.checkpoint_exclude_scopes is not None:
        exclusions = [scope.strip() for scope in flags.checkpoint_exclude_scopes.split(',')]
    reader = tf.train.NewCheckpointReader(save_file)
    saved_shapes = reader.get_variable_to_shape_map()
    print('saved_shapes %d' % len(saved_shapes))
    var_names = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                        if var.name.split(':')[0] in saved_shapes])
    var_names_to_be_initialized = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                                          if var.name.split(':')[0] not in saved_shapes])
    print('var_names %d' % len(var_names))
    print('var_names_to_be_initialized %d' % len(var_names_to_be_initialized))
    restore_vars = []
    name2var = dict(zip(map(lambda x: x.name.split(':')[0], tf.global_variables()), tf.global_variables()))
    print('name2var %d' % len(name2var))
    with tf.variable_scope('', reuse=True):
        variables_to_init = []
        # variables present in the checkpoint: restore them unless they are excluded
        # or their shape no longer matches (e.g. the resized Logits layer)
        for var_name, saved_var_name in var_names:
            curr_var = name2var[saved_var_name]
            var_shape = curr_var.get_shape().as_list()
            if var_shape == saved_shapes[saved_var_name]:
                excluded = False
                for exclusion in exclusions:
                    if saved_var_name.startswith(exclusion):
                        variables_to_init.append(curr_var)
                        excluded = True
                        break
                if not excluded:
                    restore_vars.append(curr_var)
            else:
                variables_to_init.append(curr_var)
        # variables that do not exist in the checkpoint at all must be initialized
        for var_name, saved_var_name in var_names_to_be_initialized:
            curr_var = name2var[saved_var_name]
            variables_to_init.append(curr_var)
        print('variables_to_init: %d ' % len(variables_to_init))
        print('global_variables: %d ' % len(tf.global_variables()))
        print('restore_vars: %d ' % len(restore_vars))
    saver = tf.train.Saver(restore_vars)
    saver.restore(session, save_file)
    session.run(tf.variables_initializer(variables_to_init))
The variables that are not restored by the saver must be initialized. To that end, you can run v.initializer.run() for each variable v that is not restored.
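A minimal sketch of that last point, assuming restore_vars is the list built by optimistic_restore above and sess is the open session:
# Sketch: initialize every global variable that was not restored from the checkpoint.
restored = set(restore_vars)
not_restored = [v for v in tf.global_variables() if v not in restored]

# either run each initializer individually...
for v in not_restored:
    v.initializer.run(session=sess)

# ...or initialize them all in one call
sess.run(tf.variables_initializer(not_restored))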