TensorFlow memory leak: could not allocate pinned host memory


I have built a convolutional model, but either my code or TensorFlow has a serious memory leak. Can anyone spot the problem and offer some insight?

Below is a minimal reproducible example along with some of its output:

Process.py:

import os
import sys

import tensorflow as tf
import Input

import os, re

FLAGS = tf.app.flags.FLAGS
TOWER_NAME = 'tower'

tf.app.flags.DEFINE_integer('batch_size', 1, "hello")
tf.app.flags.DEFINE_string('data_dir', '/home/zan/Desktop/Neural-Network-Prostate', "hello")

NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = Input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = Input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL
NUM_EPOCHS_PER_DECAY = 3

MOVING_AVERAGE_DECAY = 0.9999
NUM_EPOCHS_PER_DECAY = 30
LEARNING_RATE_DECAY_FACTOR = 0.1
INITIAL_LEARNING_RATE = 0.1


def _activation_summary(x):
  tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
  tf.histogram_summary(tensor_name + '/activations', x)
  tf.scalar_summary(tensor_name + '/sparsity', tf.nn.zero_fraction(x))

def inputs():
  if not FLAGS.data_dir:
    raise ValueError('Source Data Missing')
  data_dir = FLAGS.data_dir
  images, labels = Input.inputs(data_dir = data_dir, batch_size = FLAGS.batch_size)
  return images, labels


def eval_inputs():
  data_dir = FLAGS.data_dir
  images, labels = Input.eval_inputs(data_dir = data_dir, batch_size = 1)
  return images, labels

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape = shape)
    return tf.Variable(initial)

def conv2d(images, W):
    return tf.nn.conv2d(images, W, strides = [1, 1, 1, 1], padding = 'SAME')

def max_pool_5x5(images):
    return tf.nn.max_pool(images, ksize = [1, 5, 5, 1], strides = [1, 5, 5, 1], padding = 'SAME')

def forward_propagation(images):
  with tf.variable_scope('conv1') as scope:
      W_conv1 = weight_variable([5, 5, 3, 32])
      b_conv1 = bias_variable([32])
      image_matrix = tf.reshape(images, [-1, 1750, 1750, 3])
      h_conv1 = tf.nn.sigmoid(conv2d(image_matrix, W_conv1) + b_conv1)
      _activation_summary(h_conv1)
      h_pool1 = max_pool_5x5(h_conv1)

  with tf.variable_scope('conv2') as scope:
      W_conv2 = weight_variable([5, 5, 32, 64])
      b_conv2 = bias_variable([64])
      h_conv2 = tf.nn.sigmoid(conv2d(h_pool1, W_conv2) + b_conv2)
      _activation_summary(h_conv2)
      h_pool2 = max_pool_5x5(h_conv2)

  with tf.variable_scope('conv3') as scope:
      W_conv3 = weight_variable([5, 5, 64, 128])
      b_conv3 = bias_variable([128])
      h_conv3 = tf.nn.sigmoid(conv2d(h_pool2, W_conv3) + b_conv3)
      _activation_summary(h_conv3)
      h_pool3 = max_pool_5x5(h_conv3)

  with tf.variable_scope('local3') as scope:
      W_fc1 = weight_variable([14 * 14 * 128, 256])
      b_fc1 = bias_variable([256])
      h_pool3_flat = tf.reshape(h_pool3, [-1, 14 * 14 * 128])
      h_fc1 = tf.nn.sigmoid(tf.matmul(h_pool3_flat, W_fc1) + b_fc1)
      _activation_summary(h_fc1)
      keep_prob = tf.Variable(1.0)
      W_fc2 = weight_variable([256, 4])
      b_fc2 = bias_variable([4])
      y_conv = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2)
      _activation_summary(y_conv)
      return y_conv

def error(forward_propagation_results, labels):
    labels = tf.one_hot(labels, 4)
    tf.transpose(labels)
    labels = tf.cast(labels, tf.float32)
    mean_squared_error = tf.square(tf.sub(labels, forward_propagation_results))
    cost = tf.reduce_mean(mean_squared_error)
    #train_loss = tf.train.GradientDescentOptimizer(learning_rate = 0.05).minimize(cost)
    tf.histogram_summary('accuracy', mean_squared_error)
    tf.add_to_collection('losses', cost)

    tf.scalar_summary('LOSS', cost)

    return cost

def _add_loss_summaries(cost):
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
  losses = tf.get_collection('LOSS')
  loss_averages_op = loss_averages.apply(losses + [cost])

  for l in losses + [cost]:
    tf.scalar_summary(l.op.name +' (raw)', l)
    tf.scalar_summary(l.op.name, loss_averages.average(l))

  return loss_averages_op

def train(cost, global_step):
    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                  global_step,
                                  decay_steps,
                                  LEARNING_RATE_DECAY_FACTOR,
                                  staircase=True)
    tf.scalar_summary('learning_rate', lr)

    loss_averages_op = _add_loss_summaries(cost)

    with tf.control_dependencies([loss_averages_op]):
      opt = tf.train.GradientDescentOptimizer(lr)
      grads = opt.compute_gradients(cost)

    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)


    for var in tf.trainable_variables():
      tf.histogram_summary(var.op.name, var)

    for grad, var in grads:
      if grad is not None:
        tf.histogram_summary(var.op.name + '/gradients', grad)

    variable_averages = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
      train_op = tf.no_op(name='train')

    return train_op

main.py:

import Process

import time
import numpy as np
import os

import tensorflow as tf
from datetime import datetime

FLAGS = tf.app.flags.FLAGS


def train():
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = Process.inputs()

        forward_propgation_results = Process.forward_propagation(images)

        cost = Process.error(forward_propgation_results, labels)

        train_op = Process.train(cost, global_step)

        image_summary_t = tf.image_summary(images.name, images, max_images = 2)

        summary_op = tf.merge_all_summaries()

        init = tf.initialize_all_variables()

        saver = tf.train.Saver()

        sess = tf.InteractiveSession()

        sess.run(init)

        saver = tf.train.Saver(tf.all_variables())

        tf.train.start_queue_runners(sess = sess)

        train_dir = "/home/zan/nn-data"

        summary_writer = tf.train.SummaryWriter(train_dir, sess.graph)

        for step in xrange(650):
            start_time = time.time()
            _, loss_value = sess.run([train_op, cost])
            duration = time.time() - start_time

            assert not np.isnan(loss_value)

            if step % 1 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, (%.1f examples/sec; %.3f ''sec/batch)')
                print (format_str % (datetime.now(), step, examples_per_sec, sec_per_batch))

                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)


            if step % 20 or (step + 1) == 20:
                checkpoint_path = os.path.join(train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

def main(argv = None):
    train()

if __name__ == '__main__':
  tf.app.run()

Luckily I killed the process in time; below is the output the program logged.
Output:
I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 960, pci bus id: 0000:03:00.0)

E tensorflow/stream_executor/cuda/cuda_driver.cc:997] failed to alloc 8589934592 bytes on host: CUDA_ERROR_OUT_OF_MEMORY
W ./tensorflow/core/common_runtime/gpu/pool_allocator.h:195] could not allocate pinned host memory of size: 8589934592
E tensorflow/stream_executor/cuda/cuda_driver.cc:997] failed to alloc 7730940928 bytes on host: CUDA_ERROR_OUT_OF_MEMORY
W ./tensorflow/core/common_runtime/gpu/pool_allocator.h:195] could not allocate pinned host memory of size: 7730940928

Which line of code triggers the error? It looks like you are trying to allocate more memory than your GPU has. - sygi
I'm actually not sure exactly where it happens, but it is definitely somewhere in the training function. My GPU has 4 GB of memory, and my training data is certainly no more than 24 MB. The error only appeared after I modified my code. - Zan Huang
Is this valid (or rather, correct) Python? step % 20 or (step + 1) == 20 — I know Python has odd condition evaluation... but this just doesn't look right. - Lunaweaver
I'm fairly sure it's correct. I borrowed some snippets from Google's CIFAR10 implementation, and this is how one of its loops is written. - Zan Huang
The error message points to an 8 GB allocation on the GPU, which is what causes the OOM. I don't know of an easier way to track this down than rebuilding with IsEnabled in https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/log_memory.cc changed to always return true. - Alexandre Passos
I don't think this qualifies as a minimal reproducible example without the input data and some explanation of how to run it. - Tim Boddy
1 Answer

The out-of-memory error you are seeing is the CPU running out of memory, not the GPU; the "pinned host" memory mentioned in the log refers to CPU memory.
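
As a rough sanity check (not part of the original answer), the back-of-the-envelope arithmetic below estimates the per-image activation sizes implied by the model in Process.py, assuming float32 activations (4 bytes per element) and the layer shapes that follow from the 1750x1750x3 reshape and the 5x5/stride-5 pooling. The conv1 activation alone is roughly 374 MB per image, so intermediate tensors, their gradients, and host-side staging buffers can reach gigabytes even with batch_size = 1.

BYTES_PER_FLOAT = 4

def tensor_mb(h, w, c):
    # Size in MB of one [h, w, c] float32 activation for a single image.
    return h * w * c * BYTES_PER_FLOAT / (1024.0 ** 2)

layers = [
    ("input  (1750x1750x3)",  tensor_mb(1750, 1750, 3)),    # ~35 MB
    ("conv1  (1750x1750x32)", tensor_mb(1750, 1750, 32)),   # ~374 MB
    ("pool1  (350x350x32)",   tensor_mb(350, 350, 32)),     # ~15 MB
    ("conv2  (350x350x64)",   tensor_mb(350, 350, 64)),     # ~30 MB
    ("pool2  (70x70x64)",     tensor_mb(70, 70, 64)),       # ~1.2 MB
    ("conv3  (70x70x128)",    tensor_mb(70, 70, 128)),      # ~2.4 MB
    ("pool3  (14x14x128)",    tensor_mb(14, 14, 128)),      # ~0.1 MB
]

for name, mb in layers:
    print("%-24s %8.1f MB" % (name, mb))

This does not pinpoint the exact 8 GB request in the log, but it does show why the process needs far more memory than the 24 MB of raw training data mentioned in the comments.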
