从PowerPoint幻灯片中提取图表数据

Question

从PowerPoint幻灯片中提取图表数据

4

假设有一个包含图表数据的幻灯片文件，如何将图表数据提取为数据帧？也就是说，给定tempf.pptx文件，如何检索iris数据集？

library(magrittr)
library(mschart)
library(officer)

linec <- ms_linechart(data = iris, x = "Sepal.Length",
                      y = "Sepal.Width", group = "Species")
linec <- chart_ax_y(linec, num_fmt = "0.00", rotation = -90)

doc <- read_pptx()
doc <- add_slide(doc, layout = "Title and Content", master = "Office Theme")
doc <- ph_with_chart(doc, chart = linec)

print(doc, target = tempf.pptx <- tempfile(fileext = ".pptx"))

- Hugh

3个回答

1

"剪切和粘贴"是一种严重有缺陷的反模式，不利于可重复的代码、分析或自动化（这些都是数据科学工作流程中所追求的）。这是一个入门级代码，可以让您获得数据元素（但仍需要一些“动手工作”）。"

library(xml2)
library(magrittr)

# temp holding space for the unzipped PPTX
td <- tempfile("dir")

# unzip it and keep file names
fils <- unzip(tempf.pptx, exdir = td)

# look for chart XML files
charts <- fils[grepl("chart.*\\.xml$", fils)]

# read in the first one
chart <- read_xml(charts[1])

现在我们已经找到并读取了一个图表XML文件，让我们看看能否确定它是哪种类型的图表：

# find charts in the XML (i don't know if there can be more than one per-XML file)
(embedded_charts <- xml_find_all(chart, ".//c:chart/c:plotArea"))
## {xml_nodeset (1)}
## [1] <c:plotArea xmlns:c="http://schemas.openxmlformats.org/drawingml/200 ...

# get the node root of the first one (again, i'm not sure if there can be more than one)
(first_embed <- embedded_charts[1])
## {xml_nodeset (1)}
## [1] <c:plotArea xmlns:c="http://schemas.openxmlformats.org/drawingml/200 ...

# use it to get the kind of chart so we can target the values with it
(xml_children(first_embed) %>%
  xml_name() %>%
  grep("Chart", ., value=TRUE) -> embed_kind)
## [1] "lineChart"

现在我们可以尝试查找该图表的数据系列。

(target <- xml_find_first(first_embed, sprintf(".//c:%s", embed_kind)))
## {xml_nodeset (1)}
## [1] <c:lineChart>\n  <c:grouping val="standard"/>\n  <c:varyColors val=" ...

# extract "column" metadata
col_refs <- xml_find_all(target, ".//c:ser/c:tx/c:strRef")
(xml_find_all(col_refs, ".//c:f") %>%
    sapply(xml_text) -> col_specs)
## [1] "sheet1!$B$1" "sheet1!$C$1" "sheet1!$D$1"

(xml_find_all(col_refs, ".//c:v") %>%
    sapply(xml_text))
## [1] "setosa"     "versicolor" "virginica"

提取“X”元数据和数据：

x_val_refs <- xml_find_all(target, ".//c:cat")
(lapply(x_val_refs, xml_find_all, ".//c:f") %>%
    sapply(xml_text) -> x_val_specs)
## [1] "sheet1!$A$2:$A$36" "sheet1!$A$2:$A$36" "sheet1!$A$2:$A$36"

(lapply(x_val_refs, xml_find_all, ".//c:v") %>%
    sapply(xml_double) -> x_vals)
##       [,1] [,2] [,3]
##  [1,]  4.3  4.3  4.3
##  [2,]  4.4  4.4  4.4
##  [3,]  4.5  4.5  4.5
##  [4,]  4.6  4.6  4.6
##  [5,]  4.7  4.7  4.7
##  [6,]  4.8  4.8  4.8
##  [7,]  4.9  4.9  4.9
##  [8,]  5.0  5.0  5.0
##  [9,]  5.1  5.1  5.1
## [10,]  5.2  5.2  5.2
## [11,]  5.3  5.3  5.3
## [12,]  5.4  5.4  5.4
## [13,]  5.5  5.5  5.5
## [14,]  5.6  5.6  5.6
## [15,]  5.7  5.7  5.7
## [16,]  5.8  5.8  5.8
## [17,]  5.9  5.9  5.9
## [18,]  6.0  6.0  6.0
## [19,]  6.1  6.1  6.1
## [20,]  6.2  6.2  6.2
## [21,]  6.3  6.3  6.3
## [22,]  6.4  6.4  6.4
## [23,]  6.5  6.5  6.5
## [24,]  6.6  6.6  6.6
## [25,]  6.7  6.7  6.7
## [26,]  6.8  6.8  6.8
## [27,]  6.9  6.9  6.9
## [28,]  7.0  7.0  7.0
## [29,]  7.1  7.1  7.1
## [30,]  7.2  7.2  7.2
## [31,]  7.3  7.3  7.3
## [32,]  7.4  7.4  7.4
## [33,]  7.6  7.6  7.6
## [34,]  7.7  7.7  7.7
## [35,]  7.9  7.9  7.9

提取“Y”元数据和数据：

y_val_refs <- xml_find_all(target, ".//c:val")
(lapply(y_val_refs, xml_find_all, ".//c:f") %>%
    sapply(xml_text) -> y_val_specs)
## [1] "sheet1!$B$2:$B$36" "sheet1!$C$2:$C$36" "sheet1!$D$2:$D$36"

(lapply(y_val_refs, xml_find_all, ".//c:v") %>%
    sapply(xml_double) -> y_vals)
## [[1]]
##  [1] 3.0 3.2 2.3 3.2 3.2 3.0 3.6 3.3 3.8 4.1 3.7 3.4 3.5 3.8 4.0
## 
## [[2]]
##  [1] 2.4 2.3 2.5 2.7 3.0 2.6 2.7 2.8 2.6 3.2 3.4 3.0 2.9 2.3 2.9 2.8 3.0
## [18] 3.1 2.8 3.1 3.2
## 
## [[3]]
##  [1] 2.5 2.8 2.5 2.7 3.0 3.0 2.6 3.4 2.5 3.1 3.0 3.0 3.2 3.1 3.0 3.0 2.9
## [18] 2.8 3.0 3.0 3.8

# see if there are X & Y titles
title_nodes <- xml_find_all(first_embed, ".//c:title")
(lapply(title_nodes, xml_find_all, ".//a:t") %>%
    sapply(xml_text) -> titles)
## [1] "Sepal.Length" "Sepal.Width"

与我创建docxtractr软件包（用于从Word文档中获取表格）的动机不同，我并没有看到对上述习语的特别需求，因此我不确定在不久的将来是否会有一个软件包。

- hrbrmstr

0

我不知道有没有办法从R内部获取数据，但是您可以打开 pptx 文件，右键单击图表，然后选择“编辑数据”以查看表格形式的基础数据。然后可以使用方便的 datapasta 包将其复制并粘贴到R数据框中。

- Jon Spring

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- David Gohel · Accepted Answer

另一种方法是直接导入与图表相关的xls文件：

tempdir <- tempfile()
officer::unpack_folder(tempf.pptx, tempdir)
xl_file <- list.files(tempdir, recursive = TRUE, full.names = TRUE, pattern = "\\.xlsx$")
readxl::read_excel(xl_file)

注意：此代码仅适用于PPTX文件中只有一个数据集的情况。如果有多个文件，则应读取“关系文件”*.xml.rels以确保我们导入正确的xlsx文件（xl引用存储在ppt/charts/_rels/chart_file_title.xml.rels中）。