在Bash脚本中,我想要将一行内容分割为多个片段并存储到一个数组中。
例如,给定以下这一行:
Paris, France, Europe
我希望最终的数组看起来像这样:
array[0] = Paris
array[1] = France
array[2] = Europe
最好使用简单的实现方式,速度不重要。我该如何做?
# in: $1=delim, $2=string
# out: sets array a
# naive approach - slow
#
# Splits $2 on every occurrence of the literal string $1 and stores the
# pieces in the global array `a`.  Leading, trailing and repeated delimiters
# produce empty elements; an empty input string yields one empty element.
# An empty delimiter cannot split anything, so the whole string is returned
# as a single element (same result as split_byStr_bash_sub).
split_byStr_bash_naive(){
  a=()
  # Without this guard "${cdr%%""*}" would strip everything and the input
  # would be silently replaced by a single empty element.
  if [[ -z "$1" ]]; then
    a=("$2")
    return 0
  fi
  local prev=""
  local cdr="$2"
  # The loop below never runs for an empty string (cdr == prev == ""),
  # so add the single empty field explicitly.
  [[ -z "${cdr}" ]] && a+=("")
  # Stop once removing "up to the first delimiter" no longer changes cdr,
  # i.e. no delimiter is left.
  while [[ "$cdr" != "$prev" ]]; do
    prev="$cdr"
    a+=( "${cdr%%"$1"*}" )   # text before the first delimiter
    cdr="${cdr#*"$1"}"       # text after the first delimiter
  done
  # echo $( declare -p a | md5sum; declare -p a )
}
# use lengths wherever possible - faster
#
# in:  $1=delim (literal string), $2=string
# out: sets array a
#
# Same contract as the naive variant, but the remainder is advanced with
# substring offsets instead of a second pattern match.  An empty delimiter
# returns the whole string as one element; without that guard the loop would
# never consume any input and would append empty elements forever.
split_byStr_bash_faster(){
  a=()
  if [[ -z "$1" ]]; then
    a=("$2")
    return 0
  fi
  local car=""
  local cdr="$2"
  while
    car="${cdr%%"$1"*}"      # next field: text before the first delimiter
    a+=("$car")
    cdr="${cdr:${#car}}"     # drop the field; cdr now starts with delim or is empty
    (( ${#cdr} ))            # loop condition: anything left?
  do
    cdr="${cdr:${#1}}"       # drop the delimiter itself
  done
  # echo $( declare -p a | md5sum; declare -p a )
}
# use pattern substitution and readarray - fastest
#
# in:  $1=delim (literal string), $2=string
# out: sets array a
#
# Method: newlines are escaped out of both string and delimiter, every
# delimiter is replaced by a newline, readarray splits on newlines, and the
# escapes are undone per element.  "=" is the escape character:
#   "="     -> "=" + eq_mark
#   newline -> "=" + nl_mark
# The mark characters must differ from the delimiter's first character.
# Otherwise the delimiter could match the second half of an escape pair
# (e.g. delimiter "-" matching inside "=-") and split at a place where the
# original string contains no delimiter at all.
split_byStr_bash_sub(){
  a=()
  local delim="$1" string="$2"
  local eq_mark='-' nl_mark='n'
  case "${delim:0:1}" in
    -) eq_mark='_' ;;
    n) nl_mark='_' ;;
  esac
  delim="${delim//=/=$eq_mark}"
  delim="${delim//$'\n'/=$nl_mark}"
  string="${string//=/=$eq_mark}"
  string="${string//$'\n'/=$nl_mark}"
  # The here-string appends one newline, which terminates the last field.
  readarray -td $'\n' a <<<"${string//"$delim"/$'\n'}"
  local len=${#a[@]} i s
  for (( i=0; i<len; i++ )); do
    # Undo the newline escape first; every "=" still present afterwards
    # starts an escaped "=".
    s="${a[i]//=$nl_mark/$'\n'}"
    a[i]="${s//=$eq_mark/=}"
  done
  # echo $( declare -p a | md5sum; declare -p a )
}
`-z` 测试处理了传递零长度字符串的情况。如果没有这个测试,输出数组将为空;而有了这个测试,数组将有一个零长度的元素。将 `readarray` 替换为 `while read` 会导致不到10%的减速。
# Split a string on a regular expression using awk.
# in:  $1=delim (awk regex), $2=string
# out: sets array a
#
# The delimiter is appended to the string as a sentinel so that the last
# field is NUL-terminated as well; the element that then holds only awk's
# trailing newline is dropped afterwards.
# NOTE(review): this sentinel only works when $1, read as a regex, matches
# itself literally (e.g. "," or ", "); confirm for character classes etc.
# NOTE(review): writing "\0" requires an awk that can output NUL bytes
# (gawk does) - confirm for other awk implementations.
split_byRE_gawk(){
  # The regex is passed through the environment rather than spliced into the
  # awk source, so a "/" (or any awk syntax) in $1 cannot break or alter the
  # program.
  readarray -td '' a < <(
    delim="$1" awk '{ gsub(ENVIRON["delim"], "\0") } 1' <<<"$2$1"
  )
  # Drop the trailing newline-only element; skip when awk produced nothing.
  if (( ${#a[@]} )); then
    unset 'a[-1]'
  fi
  # echo $( declare -p a | md5sum; declare -p a )
}
# Split a string on a literal or regex delimiter using gawk.
# in:  $1=delim, $2=string, $3=optional flag; any non-empty value makes the
#      delimiter a regex, otherwise regex metacharacters in it are escaped
# out: sets array a (readarray target; not declared local)
split_byREorStr_gawk(){
local delim=$1
local string=$2
local useRegex=${3:+1} # if set, delimiter is regex
# The awk program replaces every delimiter match, and the newline at the very
# end of its input, with NUL; readarray -d '' then splits on NUL.
readarray -td '' a < <(
# delim is handed over through the environment (read via ENVIRON below)
# instead of -v; presumably to avoid awk's escape processing of -v values.
export delim
gawk -v re="$useRegex" '
BEGIN {
RS = FS = "\0"
ORS = ""
d = ENVIRON["delim"]
# cf. https://dev59.com/hJbfa4cB1Zd3GeqPyMCE#37039138
if (!re) gsub(/[\\.^$(){}\[\]|*+?]/,"\\\\&",d)
}
gsub(d"|\n$","\0")
' <<<"$string"
)
# echo $( declare -p a | md5sum; declare -p a )
}
# Split a string on a literal or regex delimiter using perl.
# in:  $1=delim, $2=string, $3=optional flag; when it is true for perl
#      (non-empty and not "0") the delimiter is used as a regex, otherwise
#      it is quoted with \Q...\E and matched literally
# out: sets array a
split_byREorStr_perl(){
  local delim=$1
  local string=$2
  local regex=$3 # if set, delimiter is regex
  # perl slurps the whole input (-0777), replaces every delimiter match with
  # NUL and also turns the newline appended by the here-string into the final
  # NUL terminator; readarray -d '' then splits on NUL.
  readarray -td '' a < <(
    export delim regex
    # \z (absolute end of input) is used instead of $: perl's $ also matches
    # just before a final newline, so a string that itself ends in "\n"
    # would lose that newline and gain a spurious empty last element.
    perl -0777pe '
      $d = $ENV{delim};
      $d = "\Q$d\E" if ! $ENV{regex};
      s/$d|\n\z/\0/g;
    ' <<<"$string"
  )
  # echo $( declare -p a | md5sum; declare -p a )
}
对于分隔符 `--` 和字符串 `a-` 或 `a---`:
declare -a a=([0]="a")
或declare -a a=([0]="a" [1]="")
declare -a a=([0]="a-")
或declare -a a=([0]="a" [1]="-")
delim="-=-="
base="ABCDEFGHIJKLMNOPQRSTUVWXYZ012345"
d=$(perl -e "print( '$delim' x (7*2**$n) )")
s=$(perl -e "print( '$delim' x (7*2**$n) . '$base' x (7*2**$n) )")
n | #s | #d | gawk | b_sub | b_faster | b_naive | |
---|---|---|---|---|---|---|---|
0 | 252 | 28 | 0.002 | 0.000 | 0.000 | 0.000 | |
1 | 504 | 56 | 0.005 | 0.000 | 0.000 | 0.001 | |
2 | 1008 | 112 | 0.005 | 0.001 | 0.000 | 0.003 | |
3 | 2016 | 224 | 0.006 | 0.001 | 0.000 | 0.009 | |
4 | 4032 | 448 | 0.007 | 0.002 | 0.001 | 0.048 | |
= | 5 | 8064 | 896 | 0.014 | 0.008 | 0.005 | 0.377 |
6 | 16128 | 1792 | 0.018 | 0.029 | 0.017 | (2.214) | |
7 | 32256 | 3584 | 0.033 | 0.057 | 0.039 | (15.16) | |
! | 8 | 64512 | 7168 | 0.063 | 0.214 | 0.128 | - |
9 | 129024 | 14336 | 0.111 | (0.826) | (0.602) | - | |
10 | 258048 | 28672 | 0.214 | (3.383) | (2.652) | - | |
!! | 11 | 516096 | 57344 | 0.430 | (13.46) | (11.00) | - |
12 | 1032192 | 114688 | (0.834) | (58.38) | - | - | |
13 | 2064384 | 229376 | <!> | (228.9) | - | - |
类型2
d=$(perl -e "print( '$delim' x ($n) )")
s=$(perl -e "print( ('$delim' x ($n) . '$base' x $n ) x (2**($n-1)) )")
n | #s | #d | gawk | b_sub | b_faster | b_naive | |
---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0.003 | 0.000 | 0.000 | 0.000 | |
1 | 36 | 4 | 0.003 | 0.000 | 0.000 | 0.000 | |
2 | 144 | 8 | 0.005 | 0.000 | 0.000 | 0.000 | |
3 | 432 | 12 | 0.005 | 0.000 | 0.000 | 0.000 | |
4 | 1152 | 16 | 0.005 | 0.001 | 0.001 | 0.002 | |
5 | 2880 | 20 | 0.005 | 0.001 | 0.002 | 0.003 | |
6 | 6912 | 24 | 0.006 | 0.003 | 0.009 | 0.014 | |
= | 7 | 16128 | 28 | 0.012 | 0.012 | 0.037 | 0.044 |
8 | 36864 | 32 | 0.023 | 0.044 | 0.167 | 0.187 | |
! | 9 | 82944 | 36 | 0.049 | 0.192 | (0.753) | (0.840) |
10 | 184320 | 40 | 0.097 | (0.925) | (3.682) | (4.016) | |
11 | 405504 | 44 | 0.204 | (4.709) | (18.00) | (19.58) | |
!! | 12 | 884736 | 48 | 0.444 | (22.17) | - | - |
13 | 1916928 | 52 | (1.019) | (102.4) | - | - |
类型3
d=$(perl -e "print( '$delim' x (2**($n-1)) )")
s=$(perl -e "print( ('$delim' x (2**($n-1)) . '$base' x (2**($n-1)) ) x ($n) )")
n | #s | #d | gawk | b_sub | b_faster | b_naive | |
---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0.000 | 0.000 | 0.000 | 0.000 | |
1 | 36 | 4 | 0.004 | 0.000 | 0.000 | 0.000 | |
2 | 144 | 8 | 0.003 | 0.000 | 0.000 | 0.000 | |
3 | 432 | 16 | 0.003 | 0.000 | 0.000 | 0.000 | |
4 | 1152 | 32 | 0.005 | 0.001 | 0.001 | 0.002 | |
5 | 2880 | 64 | 0.005 | 0.002 | 0.001 | 0.003 | |
6 | 6912 | 128 | 0.006 | 0.003 | 0.003 | 0.014 | |
= | 7 | 16128 | 256 | 0.012 | 0.011 | 0.010 | 0.077 |
8 | 36864 | 512 | 0.023 | 0.046 | 0.046 | (0.513) | |
! | 9 | 82944 | 1024 | 0.049 | 0.195 | 0.197 | (3.850) |
10 | 184320 | 2048 | 0.103 | (0.951) | (1.061) | (31.84) | |
11 | 405504 | 4096 | 0.222 | (4.796) | - | - | |
!! | 12 | 884736 | 8192 | 0.473 | (22.88) | - | - |
13 | 1916928 | 16384 | (1.126) | (105.4) | - | - |
由于短分隔符可能比长分隔符更常见,下面总结了分隔符长度在1到10之间的结果(2到9的结果大多被省略,因为非常相似)。
s1=$(perl -e "print( '$d' . '$base' x (7*2**$n) )")
s2=$(perl -e "print( ('$d' . '$base' x $n ) x (2**($n-1)) )")
s3=$(perl -e "print( ('$d' . '$base' x (2**($n-1)) ) x ($n) )")
bash_sub < gawk
string | n | #s | #d | gawk | b_sub | b_faster | b_naive |
---|---|---|---|---|---|---|---|
s1 | 10 | 229377 | 1 | 0.131 | 0.089 | 1.709 | - |
s1 | 10 | 229386 | 10 | 0.142 | 0.095 | 1.907 | - |
s2 | 8 | 32896 | 1 | 0.022 | 0.007 | 0.148 | 0.168 |
s2 | 8 | 34048 | 10 | 0.021 | 0.021 | 0.163 | 0.179 |
s3 | 12 | 786444 | 1 | 0.436 | 0.468 | - | - |
s3 | 12 | 786456 | 2 | 0.434 | 0.317 | - | - |
s3 | 12 | 786552 | 10 | 0.438 | 0.333 | - | - |
bash_sub < 0.5s
string | n | #s | #d | gawk | b_sub | b_faster | b_naive |
---|---|---|---|---|---|---|---|
s1 | 11 | 458753 | 1 | 0.256 | 0.332 | (7.089) | - |
s1 | 11 | 458762 | 10 | 0.269 | 0.387 | (8.003) | - |
s2 | 11 | 361472 | 1 | 0.205 | 0.283 | (14.54) | - |
s2 | 11 | 363520 | 3 | 0.207 | 0.462 | (16.66) | - |
s3 | 12 | 786444 | 1 | 0.436 | 0.468 | - | - |
s3 | 12 | 786456 | 2 | 0.434 | 0.317 | - | - |
s3 | 12 | 786552 | 10 | 0.438 | 0.333 | - | - |
gawk < 0.5秒
string | n | #s | #d | gawk | b_sub | b_faster | b_naive |
---|---|---|---|---|---|---|---|
s1 | 11 | 458753 | 1 | 0.256 | 0.332 | (7.089) | - |
s1 | 11 | 458762 | 10 | 0.269 | 0.387 | (8.003) | - |
s2 | 12 | 788480 | 1 | 0.440 | (1.252) | - | - |
s2 | 12 | 806912 | 10 | 0.449 | (4.968) | - | - |
s3 | 12 | 786444 | 1 | 0.436 | 0.468 | - | - |
s3 | 12 | 786456 | 2 | 0.434 | 0.317 | - | - |
s3 | 12 | 786552 | 10 | 0.438 | 0.333 | - | - |
输入代码
纯bash多字符分隔符解决方案。
正如其他人在本主题中指出的那样,原帖提供了一个逗号分隔的字符串示例以被分解为数组,但并未指明他/她是否只关心逗号分隔符、单个字符分隔符或多个字符分隔符。
由于Google倾向于将此答案排名在搜索结果的顶部或附近,我想提供一个强有力的答案给读者关于多个字符分隔符的问题,因为这也至少在一次回复中提到。
如果您正在寻找多个字符分隔符的解决方案,我建议查看Mallikarjun M的帖子,特别是gniourf_gniourf提供的这个优雅的纯BASH解决方案,使用参数扩展:
#!/bin/bash
# Split $str on every occurrence of the multi-character $delimiter and
# collect the pieces in $array using only parameter expansion.
str="LearnABCtoABCSplitABCaABCString"
delimiter=ABC
# Append one delimiter so the last field is terminated like all the others.
s=$str$delimiter
array=()
until [[ -z $s ]]; do
  # field = text before the first delimiter
  array+=("${s%%"$delimiter"*}")
  # continue with the text after that delimiter
  s=${s#*"$delimiter"}
done
declare -p array
链接到引用的评论/参考帖子
链接到引用的问题:如何在bash中使用多字符分隔符拆分字符串?
2022年8月3日更新
xebeche在下面的评论中提出了一个很好的观点。经过审核其建议的编辑,我修改了由gniourf_gniourf提供的脚本,并添加了注释以便理解该脚本正在做什么。我还将双括号 `[[ ]]` 更改为单括号 `[ ]`,以增强兼容性,因为许多Shell变体不支持双括号表示法。在这种情况下,对于Bash,逻辑在单括号或双括号内均可正常工作。
#!/bin/bash
# Split $str on a multi-character delimiter, skipping empty fields:
# leading, trailing and repeated delimiters produce no array element.
str="LearnABCtoABCSplitABCABCaABCStringABC"
delimiter="ABC"
array=()
while [ -n "$str" ]; do
  # text up to (not including) the next delimiter
  substring="${str%%"$delimiter"*}"
  if [ -z "$substring" ]; then
    # str starts with a delimiter: strip it and examine the rest
    str="${str#"$delimiter"}"
    continue
  fi
  # store the field that was found
  array+=( "$substring" )
  # drop the stored field; str now starts with a delimiter or is empty
  str="${str:${#substring}}"
  # only a trailing delimiter is left, so there is nothing more to collect
  if [ "$str" == "$delimiter" ]; then
    break
  fi
done
declare -p array
没有注释:
#!/bin/bash
str="LearnABCtoABCSplitABCABCaABCStringABC"
delimiter="ABC"
array=()
until [ -z "$str" ]; do
  substring="${str%%"$delimiter"*}"
  if [ -n "$substring" ]; then
    array+=( "$substring" )
    str="${str:${#substring}}"
    [ "$str" != "$delimiter" ] || break
  else
    str="${str#"$delimiter"}"
  fi
done
declare -p array
#!/bin/bash
string="a | b c"
pattern=' | '
# Replace every occurrence of the pattern with a newline.  Parameter
# expansion with a quoted pattern matches it literally, so characters such
# as "/", "." or "*" in the pattern need no escaping, and no external
# command (or GNU-only "\n" replacement) is required.
splitted="${string//"$pattern"/$'\n'}"
# Read the lines and put them in an array
readarray -t array2 <<< "$splitted"
# Print the number of elements
echo "${#array2[@]}"
# Print all elements
for a in "${array2[@]}"; do
  echo "> '$a'"
done
对于更大的分隔符(多个字符),此解决方案有效。
如果原始字符串中已经有换行符,则无法使用此方法。
$ aaa='Paris, France, Europe'
$ mapfile -td ',' aaaa < <(echo -n "${aaa//, /,}")
$ declare -p aaaa
结果:
declare -a aaaa=([0]="Paris" [1]="France" [2]="Europe")
它还可以用于包含空格的扩展数据,例如“纽约”:
$ aaa="New York, Paris, New Jersey, Hampshire"
$ mapfile -td ',' aaaa < <(echo -n "${aaa//, /,}")
$ declare -p aaaa
结果:
declare -a aaaa=([0]="New York" [1]="Paris" [2]="New Jersey" [3]="Hampshire")
# Usage: split_str_by "a,b,c" , # result is in "${__ret[@]}"
#
# Splits $1 on every occurrence of the literal separator $2 and stores the
# pieces in the global array __ret.  Leading, trailing and repeated
# separators produce empty elements.  An empty separator cannot split
# anything, so the whole string is returned as the only element (without
# this guard the loop below would never terminate).
# Always returns 0.
split_str_by(){
  local s="$1" sep="$2" el
  __ret=()
  if [[ -z "$sep" ]]; then
    __ret=("$s")
    return 0
  fi
  while true; do
    # next element: text before the first separator (or all of s)
    el="${s%%"$sep"*}"
    __ret+=("$el")
    # If no sep was left, quit
    [[ "$el" == "$s" ]] && break
    # continue with the text after the first separator
    s="${s#*"$sep"}"
  done
  return 0
}
# some tests (the comment after each `declare -p` is the expected output):
# single-character separator
split_str_by "a,b,c" ,
declare -p __ret # declare -a __ret=([0]="a" [1]="b" [2]="c")
# leading, doubled and trailing separators yield empty elements
split_str_by ",a,,b,c," ,
declare -p __ret # declare -a __ret=([0]="" [1]="a" [2]="" [3]="b" [4]="c" [5]="")
# two-character separator ",,"; a single "," stays part of the element
split_str_by ",,a,b,,,c,," ,,
declare -p __ret # declare -a __ret=([0]="" [1]="a,b" [2]=",c" [3]="")
# the separator " *" is matched literally, not as a glob
split_str_by " *a *b *c *" ' *'
declare -p __ret # declare -a __ret=([0]="" [1]="a" [2]="b" [3]="c" [4]="")
# separator at the very start of the string
split_str_by "--aa--bb--cc" '--'
declare -p __ret # declare -a __ret=([0]="" [1]="aa" [2]="bb" [3]="cc")
可以用 `while el=...; __ret+=...; [[ $el != $s ]]; do s=...; done` 来去掉丑陋的 `true` 和 `break`,而且我认为 `return 0` 是默认行为。 - jhnc
……`split_byStr_bash_naive`,而无需显式测试空字符串,并且似乎具有可比较的性能。 - jhnc
试试这个
# Split on commas and spaces.  IFS is set for the `read` command only, so the
# shell's global IFS stays untouched, and the array elements contain the
# bare words (no trailing commas).
string='Paris, France, Europe'
IFS=', ' read -r -a array <<< "$string"
for item in "${array[@]}"; do printf '%s\n' "$item"; done
很简单。如果你想的话,也可以添加一个声明(并且去掉逗号):
# The words of a literal array assignment are not subject to IFS word
# splitting, so IFS does not need to be (globally) changed here.
declare -a array=(Paris France Europe)
另一种方法是在不修改IFS的情况下进行操作:
# The default $IFS contains a newline, so the substituted text spans several
# lines.  -d '' makes read consume the whole input instead of stopping at the
# first newline (which would keep only the first field).  read then reports
# a non-zero status because it reaches end of input before a NUL delimiter;
# that is expected, hence `|| true`.
read -r -d '' -a myarray <<< "${string//, /$IFS}" || true
"${string//, /$IFS}"
将所有出现的所需分隔符 ", "
替换为 $IFS
的内容。
也许对于非常大的字符串来说,这可能会很慢?上述方法都没有帮助到我。最终我使用了awk解决了问题。如果有人需要,可以参考一下:
STRING="value1,value2,value3"
# awk prints every comma-separated field on its own line; mapfile collects
# those lines into a real array, so fields containing spaces or glob
# characters stay intact.
mapfile -t array < <(awk -F ',' '{ for (i = 1; i <= NF; i++) print $i }' <<< "$STRING")
for word in "${array[@]}"
do
  echo "This is the word $word"
done
更新:由于eval存在问题,请勿这样做。
稍微简单一点:
# WARNING: the unquoted $string is word-split on IFS and then also subject to
# pathname expansion, so a field such as "*" is replaced by file names.
IFS=', ' eval 'array=($string)'
例如
string="foo, bar,baz"
# eval evaluates array=($string) with IFS=', ', so the unquoted expansion is
# split on commas and spaces (it is also subject to pathname expansion).
# NOTE(review): in POSIX mode an assignment preceding a special builtin such
# as eval persists afterwards - confirm IFS is not left modified.
IFS=', ' eval 'array=($string)'
echo ${array[1]} # -> bar
……`$`,然后你就会看到……我写了很多脚本,从来没有用过一个 `eval`。 - caesarsol
read -a my_array <<< $(echo ${INPUT_STRING} | tr -d ' ' | tr ',' ' ')
`cut` 是一个有用的 bash 命令,也可以定义分隔符。https://en.wikibooks.org/wiki/Cut 你还可以从固定宽度的记录结构中提取数据。https://en.wikipedia.org/wiki/Cut_(Unix) https://www.computerhope.com/unix/ucut.htm - JGFMK