Can I export TensorFlow summaries to CSV?

34

Is there a way to extract scalar summaries from tfevents files to CSV (ideally from within TensorBoard)?

Example code

The following code generates tfevents files in summary_dir within the same directory. Suppose you let it run and found something interesting. You want to get the raw data for further investigation. How would you do that?

#!/usr/bin/env python
"""A very simple MNIST classifier."""
import argparse
import sys

from tensorflow.examples.tutorials.mnist import input_data

import tensorflow as tf
ce_with_logits = tf.nn.softmax_cross_entropy_with_logits

FLAGS = None


def inference(x):
    """
    Build the inference graph.

    Parameters
    ----------
    x : placeholder

    Returns
    -------
    Output tensor with the computed logits.
    """
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, W) + b
    return y


def loss(logits, labels):
    """
    Calculate the loss from the logits and the labels.

    Parameters
    ----------
    logits : Logits tensor, float - [batch_size, NUM_CLASSES].
    labels : Labels tensor, int32 - [batch_size]
    """
    cross_entropy = tf.reduce_mean(ce_with_logits(labels=labels,
                                                  logits=logits))
    return cross_entropy


def training(loss, learning_rate=0.5):
    """
    Set up the training Ops.

    Parameters
    ----------
    loss : Loss tensor, from loss().
    learning_rate : The learning rate to use for gradient descent.

    Returns
    -------
    train_op: The Op for training.
    """
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_step = optimizer.minimize(loss)
    return train_step


def main(_):
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])
    y = inference(x)

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])
    loss_ = loss(logits=y, labels=y_)
    train_step = training(loss_)

    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    with tf.name_scope('accuracy'):
        tf.summary.scalar('accuracy', accuracy)
    merged = tf.summary.merge_all()

    sess = tf.InteractiveSession()
    train_writer = tf.summary.FileWriter('summary_dir/train', sess.graph)
    test_writer = tf.summary.FileWriter('summary_dir/test', sess.graph)
    tf.global_variables_initializer().run()

    for train_step_i in range(100000):
        if train_step_i % 100 == 0:
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict={x: mnist.test.images,
                                               y_: mnist.test.labels})
            test_writer.add_summary(summary, train_step_i)
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict={x: mnist.train.images,
                                               y_: mnist.train.labels})
            train_writer.add_summary(summary, train_step_i)
        batch_xs, batch_ys = mnist.train.next_batch(100)
        sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

    print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                        y_: mnist.test.labels}))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        type=str,
                        default='/tmp/tensorflow/mnist/input_data',
                        help='Directory for storing input data')
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
5 Answers

42

While the answer here does it from within TensorBoard as requested, it only allows downloading a CSV for a single run of a single tag. If you have, for example, 10 tags and 20 runs (which is not much at all), you would need to repeat the above step 200 times (that alone will probably take you more than an hour). If you then, for whatever reason, actually want to do something with the data of all runs for a single tag, you would need to write an awkward CSV-accumulation script or copy everything by hand (which will probably cost you more than a day).

So I would like to add a solution that extracts one CSV file per tag, containing all runs. The column headers are the run path names and the row index is the run step numbers.

import os
import numpy as np
import pandas as pd

from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


def tabulate_events(dpath):
    # One EventAccumulator per run directory under dpath.
    summary_iterators = [EventAccumulator(os.path.join(dpath, dname)).Reload() for dname in os.listdir(dpath)]

    tags = summary_iterators[0].Tags()['scalars']

    # Every run must log exactly the same scalar tags.
    for it in summary_iterators:
        assert it.Tags()['scalars'] == tags

    out = defaultdict(list)
    steps = []

    for tag in tags:
        steps = [e.step for e in summary_iterators[0].Scalars(tag)]

        # Walk all runs in lockstep; each run must have an event at every step.
        for events in zip(*[acc.Scalars(tag) for acc in summary_iterators]):
            assert len(set(e.step for e in events)) == 1

            out[tag].append([e.value for e in events])

    return out, steps


def to_csv(dpath):
    dirs = os.listdir(dpath)

    d, steps = tabulate_events(dpath)
    tags, values = zip(*d.items())
    np_values = np.array(values)

    for index, tag in enumerate(tags):
        df = pd.DataFrame(np_values[index], index=steps, columns=dirs)
        df.to_csv(get_file_path(dpath, tag))


def get_file_path(dpath, tag):
    file_name = tag.replace("/", "_") + '.csv'
    folder_path = os.path.join(dpath, 'csv')
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    return os.path.join(folder_path, file_name)


if __name__ == '__main__':
    path = "path_to_your_summaries"
    to_csv(path)
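
Note that to_csv expects dpath to contain only run subdirectories logging identical scalar tags and step counts. Since the csv output folder is created inside dpath, re-running the script on the same path will likely fail (os.listdir then picks up the csv folder as if it were a run), so remove or relocate it first.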

My solution builds on top of: https://dev59.com/u1gQ5IYBdhLWcg3wPxnb#48774926


Edit:

I created a more sophisticated version and released it on GitHub: https://github.com/Spenhouet/tensorboard-aggregator

This version aggregates multiple TensorBoard runs and can save the aggregates to a new TensorBoard summary or to a .csv file.


1
Very nice solution! One issue though: the CSV files saved manually from the TensorBoard page contain a "Wall time" column, but with your code that column is not saved. Do you have a fix for that? Thanks a lot! - f10w
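
A possible workaround for the missing wall time (a minimal sketch, not from the answer's author; the function name scalars_to_csv_with_wall_time is hypothetical): every ScalarEvent also exposes wall_time alongside step and value, so it can be written out as an extra column, mirroring the three columns of TensorBoard's manual download:

import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

def scalars_to_csv_with_wall_time(run_dir, tag, out_path):
    # Each ScalarEvent carries wall_time, step and value.
    events = EventAccumulator(run_dir).Reload().Scalars(tag)
    df = pd.DataFrame({"wall_time": [e.wall_time for e in events],
                       "step": [e.step for e in events],
                       "value": [e.value for e in events]})
    df.to_csv(out_path, index=False)
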
Really nice solution! I tried to make it work when the summaries have differing numbers of steps (e.g. epochs), without success. Is there an easy way to fix this? I would happily pad the shorter columns of the dataframe with NaN values. Thanks a lot for your help! - user1835630
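
For the varying-step-count case raised above, a minimal sketch (not part of the original answer; tabulate_tag is a hypothetical helper) is to build one pandas Series per run, indexed by step, and let pd.concat align them, which pads missing steps with NaN:

import os
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

def tabulate_tag(dpath, tag):
    series = {}
    for dname in os.listdir(dpath):
        events = EventAccumulator(os.path.join(dpath, dname)).Reload().Scalars(tag)
        series[dname] = pd.Series([e.value for e in events],
                                  index=[e.step for e in events])
    # pd.concat aligns on the step index; shorter runs are padded with NaN.
    return pd.concat(series, axis=1)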

36

Just check the "Data download links" option in the upper left of TensorBoard, and then click the "CSV" button that appears below your scalar summary.

[Screenshot: a scalar chart in TensorBoard with the CSV download button shown below it]


Do you know if it is possible to customize the CSV, e.g. the header row? - Martin Thoma
At least not from within TensorBoard, as far as I know. I think the easiest way would be to write a script that modifies the first row of the CSV. If you are on a Unix-based OS, you could do something like this - jabalazs
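
The command originally linked in that comment is lost in this copy, but the idea is straightforward; here is a hypothetical Python equivalent that rewrites the header row of a downloaded CSV (the file name and the new column names are made up):

import csv

# Read the downloaded file, replace the header row, and write it back.
with open("downloaded.csv") as f:
    rows = list(csv.reader(f))
rows[0] = ["wall_time", "step", "accuracy"]
with open("downloaded.csv", "w", newline="") as f:
    csv.writer(f).writerows(rows)
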
5
The only option that shows up for me is SVG, do you know why? - Charlie Parker

4
Here is my solution, based on the previous one but able to scale.
import os
import numpy as np
import pandas as pd

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


def tabulate_events(dpath):
    final_out = {}
    for dname in os.listdir(dpath):
        print(f"Converting run {dname}", end="")
        ea = EventAccumulator(os.path.join(dpath, dname)).Reload()
        tags = ea.Tags()['scalars']

        out = {}

        for tag in tags:
            tag_values = []
            wall_time = []
            steps = []

            for event in ea.Scalars(tag):
                tag_values.append(event.value)
                wall_time.append(event.wall_time)
                steps.append(event.step)

            # Rows are 'value' and 'wall_time', columns are the step numbers.
            out[tag] = pd.DataFrame(
                data=dict(zip(steps, np.array([tag_values, wall_time]).transpose())),
                columns=steps,
                index=['value', 'wall_time'])

        if len(tags) > 0:
            df = pd.concat(out.values(), keys=out.keys())
            df.to_csv(f'{dname}.csv')
            print(" - Done")
            final_out[dname] = df  # only keep runs that actually had scalars
        else:
            print(" - No scalars to write")

    return final_out


if __name__ == '__main__':
    path = "your/path/here"
    steps = tabulate_events(path)
    pd.concat(steps.values(), keys=steps.keys()).to_csv('all_result.csv')
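
To load one of the per-run CSVs back for analysis, something like the following should work (a sketch; the file has a two-level row index of tag and value/wall_time with the steps as columns, and the file and tag names are hypothetical):

import pandas as pd

df = pd.read_csv("my_run.csv", index_col=[0, 1])
print(df.loc[("accuracy", "value")])  # the value series of one tag across all steps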

3

A very simple example:

import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

log_dir = "lightning_logs/version_1"

event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()

events = event_accumulator.Scalars("train_loss")
x = [x.step for x in events]
y = [x.value for x in events]

df = pd.DataFrame({"step": x, "train_loss": y})
df.to_csv("train_loss.csv")
print(df)

   step  train_loss
0     0  700.491516
1     1  163.593246
2     2  146.365448
3     3  153.830215
...
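
If you do not know which scalar tags are available in a log, you can list them first (same log_dir as above):

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

event_accumulator = EventAccumulator("lightning_logs/version_1")
event_accumulator.Reload()
print(event_accumulator.Tags()["scalars"])  # e.g. ['train_loss', 'epoch', ...]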

Example of plotting the loss over the number of epochs:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

log_dir = "lightning_logs/version_1"
y_key = "val_loss"

event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()

steps = {x.step for x in event_accumulator.Scalars("epoch")}
x = list(range(len(steps)))
y = [x.value for x in event_accumulator.Scalars(y_key) if x.step in steps]
df = pd.DataFrame({"epoch": x, y_key: y})
df.to_csv(f"{y_key}.csv")

fig, ax = plt.subplots()
sns.lineplot(data=df, x="epoch", y=y_key)
fig.savefig("plot.png", dpi=300)

[Plot: val_loss over epochs]


1

Adding on to @Spen's answer:

If you want to export the data when the number of steps differs between runs, the following works. It produces one large CSV file. You may need to adjust the keys for your own case.

import glob

import numpy as np
import pandas as pd

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Every immediate subdirectory of the current working directory is treated as a run.
listOutput = glob.glob("*/")

listDF = []

for tb_output_folder in listOutput:
    print(tb_output_folder)
    x = EventAccumulator(path=tb_output_folder)
    x.Reload()
    x.FirstEventTimestamp()
    keys = ['loss', 'mean_absolute_error', 'val_loss', 'val_mean_absolute_error']

    steps = [e.step for e in x.Scalars(keys[0])]
    wall_time = [e.wall_time for e in x.Scalars(keys[0])]  # collected but unused here
    n_steps = len(steps)
    listRun = [tb_output_folder] * n_steps

    # One column of values per key; every key must have n_steps events.
    data = np.zeros((n_steps, len(keys)))
    for i in range(len(keys)):
        data[:, i] = [e.value for e in x.Scalars(keys[i])]

    printOutDict = {keys[0]: data[:, 0], keys[1]: data[:, 1],
                    keys[2]: data[:, 2], keys[3]: data[:, 3]}

    printOutDict['Name'] = listRun

    DF = pd.DataFrame(data=printOutDict)

    listDF.append(DF)

df = pd.concat(listDF)
df.to_csv('Output.csv')
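
Note that the script above is meant to be run from the directory that contains the run folders, and each run is expected to contain all four keys with the same number of steps within a run.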


This works great! Thanks a lot for sharing :) - Oğuz Şerbetci
