我正在尝试找出一种有效的方式来分割像这样的字符串
"111110000011110000111000"
转换为向量
[1] "11111" "00000" "1111" "0000" "111" "000"
"0"和"1"可以是任何交替的字符。
我正在尝试找出一种有效的方式来分割像这样的字符串
"111110000011110000111000"
转换为向量
[1] "11111" "00000" "1111" "0000" "111" "000"
"0"和"1"可以是任何交替的字符。
尝试
strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
#[1] "11111" "00000" "1111" "0000" "111" "000"
在 @rawr 的解决方案基础上进行了修改,使用了 stri_extract_all_regex
。
library(stringi)
stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "1111" "0000" "111" "000"
stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]]
#[1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111"
#[10] "000"
stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]]
#[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111"
#[8] "00000" "222" "aaa" "bb" "cc" "d" "11"
#[15] "D" "aa" "BB"
library(stringi)
set.seed(24)
x3 <- stri_rand_strings(1, 1e4)
akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]]
#modified @thelatemail's function to make it bit more general
thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3,
perl=TRUE))[[1]]
rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
ananda <- function() unlist(read.fwf(textConnection(x3),
rle(strsplit(x3, "")[[1]])$lengths,
colClasses = "character"))
Colonel <- function() with(rle(strsplit(x3,'')[[1]]),
mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
Cryo <- function(){
res_vector=rep(NA_character_,nchar(x3))
res_vector[1]=substr(x3,1,1)
counter=1
old_tmp=''
for (i in 2:nchar(x3)) {
tmp=substr(x3,i,i)
if (tmp==old_tmp) {
res_vector[counter]=paste0(res_vector[counter],tmp)
} else {
res_vector[counter+1]=tmp
counter=counter+1
}
old_tmp=tmp
}
res_vector[!is.na(res_vector)]
}
richard <- function(){
cs <- cumsum(
rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
)
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
nicola<-function(x) {
indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x))
substring(x,indices[-length(indices)]+1,indices[-1])
}
richard2 <- function() {
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
system.time(akrun())
# user system elapsed
# 0.003 0.000 0.003
system.time(thelate())
# user system elapsed
# 0.272 0.001 0.274
system.time(rawr())
# user system elapsed
# 0.397 0.001 0.398
system.time(ananda())
# user system elapsed
# 3.744 0.204 3.949
system.time(Colonel())
# user system elapsed
# 0.154 0.001 0.154
system.time(Cryo())
# user system elapsed
# 0.220 0.005 0.226
system.time(richard())
# user system elapsed
# 0.007 0.000 0.006
system.time(nicola(x3))
# user system elapsed
# 0.190 0.001 0.191
在稍微更长的字符串上,
set.seed(24)
x3 <- stri_rand_strings(1, 1e6)
system.time(akrun())
#user system elapsed
#0.166 0.000 0.155
system.time(richard())
# user system elapsed
# 0.606 0.000 0.569
system.time(richard2())
# user system elapsed
# 0.518 0.000 0.487
system.time(Colonel())
# user system elapsed
# 9.631 0.000 9.358
library(microbenchmark)
microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')
#Unit: relative
# expr min lq mean median uq max neval cld
# richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581 20 b
#richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861 20 b
# akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20 a
注意: 尝试运行其他方法,但需要很长时间
str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
主题的变化:
x <- "111110000011110000111000"
regmatches(x,gregexpr("1+|0+",x))[[1]]
#[1] "11111" "00000" "1111" "0000" "111" "000"
你可以使用substr
或read.fwf
与rle
一起使用(虽然它不太可能像基于正则表达式的解决方案一样高效):
x <- "111110000011110000111000"
unlist(read.fwf(textConnection(x),
rle(strsplit(x, "")[[1]])$lengths,
colClasses = "character"))
# V1 V2 V3 V4 V5 V6
# "11111" "00000" "1111" "0000" "111" "000"
这种方法的一个优点是即使是以下情况,它也可以工作:
x <- paste(c(rep("a", 5), rep("b", 2), rep("c", 7),
rep("b", 3), rep("a", 1), rep("d", 1)), collapse = "")
x
# [1] "aaaaabbcccccccbbbad"
unlist(read.fwf(textConnection(x),
rle(strsplit(x, "")[[1]])$lengths,
colClasses = "character"))
# V1 V2 V3 V4 V5 V6
# "aaaaa" "bb" "ccccccc" "bbb" "a" "d"
另一种方式是在交替的数字之间添加空格。这适用于任何两个数字,而不仅仅是1和0。然后在空格上使用strsplit
:
x <- "111110000011110000111000"
(y <- gsub('(\\d)(?!\\1)', '\\1 \\2', x, perl = TRUE))
# [1] "11111 00000 1111 0000 111 000 "
strsplit(y, ' ')[[1]]
# [1] "11111" "00000" "1111" "0000" "111" "000"
正如@akrun所指出的那样,更简洁地说:
strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111" "0000" "111" "000"
同时将\\d
更改为\\w
也可以起到同样的作用。
x <- "aaaaabbcccccccbbbad"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d"
x <- "111110000011110000111000"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111" "0000" "111" "000"
你也可以使用\K
(而不是明确使用捕获组\\1
和\\2
),但我很少看到它被使用,也不知道如何解释它:}
据我所知,\\K
重置了报告匹配的起始点,并且先前已经匹配的字符不再包括在内,基本上抛弃了到那个点为止匹配的所有内容。
x <- "1111100000222000333300011110000111000"
(z <- gsub('(\\d)\\K(?!\\1)', ' ', x, perl = TRUE))
# [1] "11111 00000 222 000 3333 000 1111 0000 111 000 "
strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]]
(尽管我还没有用多种情况测试过它:-))。 - akrun\\K
是做什么的。 - rawr\\w
的方法在这两种情况下都应该有效。我不太常用\\K
,但我猜你在关于它的帖子中解释过了。 - akrun原始方法:这是一个包含rle()
的stringi方法。
x <- "111110000011110000111000"
library(stringi)
cs <- cumsum(
rle(stri_split_boundaries(x, type = "character")[[1L]])$lengths
)
stri_sub(x, c(1L, head(cs + 1L, -1L)), cs)
# [1] "11111" "00000" "1111" "0000" "111" "000"
或者,您可以在stri_sub()
函数中使用length
参数。
rl <- rle(stri_split_boundaries(x, type = "character")[[1L]])
with(rl, {
stri_sub(x, c(1L, head(cumsum(lengths) + 1L, -1L)), length = lengths)
})
# [1] "11111" "00000" "1111" "0000" "111" "000"
效率更新:在意识到base::strsplit()
比stringi::stri_split_boundaries()
更快之后,这里是我之前答案的更高效版本,只使用了基本函数。
set.seed(24)
x3 <- stri_rand_strings(1L, 1e6L)
system.time({
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
substring(x3, c(1L, head(cs + 1L, -1L)), cs)
})
# user system elapsed
# 0.686 0.012 0.697
另一种方法是使用mapply
:
x="111110000011110000111000"
with(rle(strsplit(x,'')[[1]]),
mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
#[1] "11111" "00000" "1111" "0000" "111" "000"
regmatches
的速度,我会相信它更慢!这是因为我不知道这个函数背后隐藏着什么! - Colonel Beauvelregmatches
通常更快,但这也可能取决于所使用的regex
。在这里,我正在测试一个更一般的情况。如果我们使用与@thelatemail帖子中相同的正则表达式来测试二进制字符串,则计时可能会有所不同。 - akrun虽然这不是OP想要的简洁的R代码,但我尝试使用Rcpp
编写了一份相对简单且比最快的基于R语言的答案快5倍的代码。
library(Rcpp)
cppFunction(
'std::vector<std::string> split_str_cpp(std::string x) {
std::vector<std::string> parts;
int start = 0;
for(int i = 1; i <= x.length(); i++) {
if(x[i] != x[i-1]) {
parts.push_back(x.substr(start, i-start));
start = i;
}
}
return parts;
}')
在这些上进行测试
str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
> split_str_cpp(str1)
[1] "11111" "00000" "1111" "0000" "111" "000"
> split_str_cpp(x1)
[1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111" "000"
> split_str_cpp(x2)
[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" "00000" "222" "aaa" "bb" "cc" "d" "11"
[15] "D" "aa" "BB"
一项基准测试显示,它比R语言解决方案快5-10倍。
akrun <- function(str1) strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
richard1 <- function(x3){
cs <- cumsum(
rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
)
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
richard2 <- function(x3) {
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
library(microbenchmark)
library(stringi)
set.seed(24)
x3 <- stri_rand_strings(1, 1e6)
microbenchmark(split_str_cpp(x3), akrun(x3), richard1(x3), richard2(x3), unit = 'relative', times=20L)
比较:
Unit: relative
expr min lq mean median uq max neval
split_str_cpp(x3) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
akrun(x3) 9.675613 8.952997 8.241750 8.689001 8.403634 4.423134 20
richard1(x3) 5.355620 5.226103 5.483171 5.947053 5.982943 3.379446 20
richard2(x3) 4.842398 4.756086 5.046077 5.389570 5.389193 3.669680 20
简单的 for
循环解决方案
x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=substr(x,1,1)
for (i in 2:nchar(x)) {
tmp=substr(x,i,i)
if (tmp==substr(x,i-1,i-1)) {
res_vector[length(res_vector)]=paste0(res_vector[length(res_vector)],tmp)
} else {
res_vector[length(res_vector)+1]=tmp
}
}
res_vector
#[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" "00000" "222" "aaa" "bb" "cc" "d" "11" "D" "aa" "BB"
或者使用预先分配的结果向量,可能会稍微快一些。
x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=rep(NA_character_,nchar(x))
res_vector[1]=substr(x,1,1)
counter=1
old_tmp=''
for (i in 2:nchar(x)) {
tmp=substr(x,i,i)
if (tmp==old_tmp) {
res_vector[counter]=paste0(res_vector[counter],tmp)
} else {
res_vector[counter+1]=tmp
counter=counter+1
}
old_tmp=tmp
}
res_vector[!is.na(res_vector)]
s <- "111110000011110000111000"
spl <- strsplit(s,"10|01")[[1]]
l <- length(spl)
sapply(1:l, function(i) paste0(spl[i],i%%2,ifelse(i==1 | i==l, "",i%%2)))
# [1] "11111" "00000" "1111" "0000" "111" "000"
sapply(seq_along(spl), ...)
而不必费心提取其长度作为单独的变量。 - Frank