natural_sorted
,它的行为类似于内置函数sorted
:# Copyright (C) 2018, Benjamin Drung <bdrung@posteo.de>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
import re
def natural_sorted(iterable, key=None, reverse=False):
"""Return a new naturally sorted list from the items in *iterable*.
The returned list is in natural sort order. The string is ordered
lexicographically (using the Unicode code point number to order individual
characters), except that multi-digit numbers are ordered as a single
character.
Has two optional arguments which must be specified as keyword arguments.
*key* specifies a function of one argument that is used to extract a
comparison key from each list element: ``key=str.lower``. The default value
is ``None`` (compare the elements directly).
*reverse* is a boolean value. If set to ``True``, then the list elements are
sorted as if each comparison were reversed.
The :func:`natural_sorted` function is guaranteed to be stable. A sort is
stable if it guarantees not to change the relative order of elements that
compare equal --- this is helpful for sorting in multiple passes (for
example, sort by department, then by salary grade).
"""
prog = re.compile(r"(\d+)")
def alphanum_key(element):
"""Split given key in list of strings and digits"""
return [int(c) if c.isdigit() else c for c in prog.split(key(element)
if key else element)]
return sorted(iterable, key=alphanum_key, reverse=reverse)
源代码也可在我的 GitHub 代码片段存储库中找到:https://github.com/bdrung/snippets/blob/master/natural_sorted.py
List[Tuple(str, int)]
的紧凑解决方案。
def string_to_pairs(s, pairs=re.compile(r"(\D*)(\d*)").findall):
return [(text.lower(), int(digits or 0)) for (text, digits) in pairs(s)[:-1]]
sorted(['Elm11', 'Elm12', 'Elm2', 'elm0', 'elm1', 'elm10', 'elm13', 'elm9'], key=string_to_pairs)
输出:
['elm0', 'elm1', 'Elm2', 'elm9', 'elm10', 'Elm11', 'Elm12', 'elm13']
assert string_to_pairs("") == []
assert string_to_pairs("123") == [("", 123)]
assert string_to_pairs("abc") == [("abc", 0)]
assert string_to_pairs("123abc") == [("", 123), ("abc", 0)]
assert string_to_pairs("abc123") == [("abc", 123)]
assert string_to_pairs("123abc456") == [("", 123), ("abc", 456)]
assert string_to_pairs("abc123efg") == [("abc", 123), ("efg", 0)]
# Some extracts from the test suite of the natsort library. Permalink:
# https://github.com/SethMMorton/natsort/blob/e3c32f5638bf3a0e9a23633495269bea0e75d379/tests/test_natsorted.py
sort_data = [
( # same as test_natsorted_can_sort_as_unsigned_ints_which_is_default()
["a50", "a51.", "a50.31", "a-50", "a50.4", "a5.034e1", "a50.300"],
["a5.034e1", "a50", "a50.4", "a50.31", "a50.300", "a51.", "a-50"],
),
( # same as test_natsorted_numbers_in_ascending_order()
["a2", "a5", "a9", "a1", "a4", "a10", "a6"],
["a1", "a2", "a4", "a5", "a6", "a9", "a10"],
),
( # same as test_natsorted_can_sort_as_version_numbers()
["1.9.9a", "1.11", "1.9.9b", "1.11.4", "1.10.1"],
["1.9.9a", "1.9.9b", "1.10.1", "1.11", "1.11.4"],
),
( # different from test_natsorted_handles_filesystem_paths()
[
"/p/Folder (10)/file.tar.gz",
"/p/Folder (1)/file (1).tar.gz",
"/p/Folder/file.x1.9.tar.gz",
"/p/Folder (1)/file.tar.gz",
"/p/Folder/file.x1.10.tar.gz",
],
[
"/p/Folder (1)/file (1).tar.gz",
"/p/Folder (1)/file.tar.gz",
"/p/Folder (10)/file.tar.gz",
"/p/Folder/file.x1.9.tar.gz",
"/p/Folder/file.x1.10.tar.gz",
],
),
( # same as test_natsorted_path_extensions_heuristic()
[
"Try.Me.Bug - 09 - One.Two.Three.[text].mkv",
"Try.Me.Bug - 07 - One.Two.5.[text].mkv",
"Try.Me.Bug - 08 - One.Two.Three[text].mkv",
],
[
"Try.Me.Bug - 07 - One.Two.5.[text].mkv",
"Try.Me.Bug - 08 - One.Two.Three[text].mkv",
"Try.Me.Bug - 09 - One.Two.Three.[text].mkv",
],
),
( # same as ns.IGNORECASE for test_natsorted_supports_case_handling()
["Apple", "corn", "Corn", "Banana", "apple", "banana"],
["Apple", "apple", "Banana", "banana", "corn", "Corn"],
),
]
for (given, expected) in sort_data:
assert sorted(given, key=string_to_pairs) == expected
如果你的字符串中混合了非ASCII文本和数字,你可能会对将 string_to_pairs()
与我在其他地方提供的函数remove_diacritics()
进行组合感兴趣。
functools.cmp_to_key()
很可能与 Python 排序的底层实现密切相关。此外,cmp 参数是遗留的。现代方法是将输入项转换为支持所需丰富比较操作的对象。Python 2.7.12 (default, Sep 29 2016, 13:30:34)
>>> (0,"foo") < ("foo",0)
True
Python 3.5.2 (default, Oct 14 2016, 12:54:53)
>>> (0,"foo") < ("foo",0)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: unorderable types: int() < str()
有三种不同的方法。第一种使用嵌套类,利用Python的Iterable
比较算法。第二种将这种嵌套展开成单个类。第三种放弃了子类化str
,专注于性能。所有方法都计时;第二种方法的速度是第一种的两倍,而第三种方法几乎快6倍。子类化str
并非必须,而且在第一次尝试时可能是个坏主意,但它确实带来了某些便利。
排序字符被复制以强制按大小写顺序排序,并交换大小写字母以使小写字母先进行排序;这是“自然排序”的典型定义。我无法确定分组类型;有些人可能更喜欢以下方式,这也带来了显著的性能优势:
d = lambda s: s.lower()+s.swapcase()
如果使用了比较运算符,则将其设置为object
的运算符,以便它们不会被functools.total_ordering
忽略。
import functools
import itertools
@functools.total_ordering
class NaturalStringA(str):
def __repr__(self):
return "{}({})".format\
( type(self).__name__
, super().__repr__()
)
d = lambda c, s: [ c.NaturalStringPart("".join(v))
for k,v in
itertools.groupby(s, c.isdigit)
]
d = classmethod(d)
@functools.total_ordering
class NaturalStringPart(str):
d = lambda s: "".join(c.lower()+c.swapcase() for c in s)
d = staticmethod(d)
def __lt__(self, other):
if not isinstance(self, type(other)):
return NotImplemented
try:
return int(self) < int(other)
except ValueError:
if self.isdigit():
return True
elif other.isdigit():
return False
else:
return self.d(self) < self.d(other)
def __eq__(self, other):
if not isinstance(self, type(other)):
return NotImplemented
try:
return int(self) == int(other)
except ValueError:
if self.isdigit() or other.isdigit():
return False
else:
return self.d(self) == self.d(other)
__le__ = object.__le__
__ne__ = object.__ne__
__gt__ = object.__gt__
__ge__ = object.__ge__
def __lt__(self, other):
return self.d(self) < self.d(other)
def __eq__(self, other):
return self.d(self) == self.d(other)
__le__ = object.__le__
__ne__ = object.__ne__
__gt__ = object.__gt__
__ge__ = object.__ge__
import functools
import itertools
@functools.total_ordering
class NaturalStringB(str):
def __repr__(self):
return "{}({})".format\
( type(self).__name__
, super().__repr__()
)
d = lambda s: "".join(c.lower()+c.swapcase() for c in s)
d = staticmethod(d)
def __lt__(self, other):
if not isinstance(self, type(other)):
return NotImplemented
groups = map(lambda i: itertools.groupby(i, type(self).isdigit), (self, other))
zipped = itertools.zip_longest(*groups)
for s,o in zipped:
if s is None:
return True
if o is None:
return False
s_k, s_v = s[0], "".join(s[1])
o_k, o_v = o[0], "".join(o[1])
if s_k and o_k:
s_v, o_v = int(s_v), int(o_v)
if s_v == o_v:
continue
return s_v < o_v
elif s_k:
return True
elif o_k:
return False
else:
s_v, o_v = self.d(s_v), self.d(o_v)
if s_v == o_v:
continue
return s_v < o_v
return False
def __eq__(self, other):
if not isinstance(self, type(other)):
return NotImplemented
groups = map(lambda i: itertools.groupby(i, type(self).isdigit), (self, other))
zipped = itertools.zip_longest(*groups)
for s,o in zipped:
if s is None or o is None:
return False
s_k, s_v = s[0], "".join(s[1])
o_k, o_v = o[0], "".join(o[1])
if s_k and o_k:
s_v, o_v = int(s_v), int(o_v)
if s_v == o_v:
continue
return False
elif s_k or o_k:
return False
else:
s_v, o_v = self.d(s_v), self.d(o_v)
if s_v == o_v:
continue
return False
return True
__le__ = object.__le__
__ne__ = object.__ne__
__gt__ = object.__gt__
__ge__ = object.__ge__
import functools
import itertools
import enum
class OrderingType(enum.Enum):
PerWordSwapCase = lambda s: s.lower()+s.swapcase()
PerCharacterSwapCase = lambda s: "".join(c.lower()+c.swapcase() for c in s)
class NaturalOrdering:
@classmethod
def by(cls, ordering):
def wrapper(string):
return cls(string, ordering)
return wrapper
def __init__(self, string, ordering=OrderingType.PerCharacterSwapCase):
self.string = string
self.groups = [ (k,int("".join(v)))
if k else
(k,ordering("".join(v)))
for k,v in
itertools.groupby(string, str.isdigit)
]
def __repr__(self):
return "{}({})".format\
( type(self).__name__
, self.string
)
def __lesser(self, other, default):
if not isinstance(self, type(other)):
return NotImplemented
for s,o in itertools.zip_longest(self.groups, other.groups):
if s is None:
return True
if o is None:
return False
s_k, s_v = s
o_k, o_v = o
if s_k and o_k:
if s_v == o_v:
continue
return s_v < o_v
elif s_k:
return True
elif o_k:
return False
else:
if s_v == o_v:
continue
return s_v < o_v
return default
def __lt__(self, other):
return self.__lesser(other, default=False)
def __le__(self, other):
return self.__lesser(other, default=True)
def __eq__(self, other):
if not isinstance(self, type(other)):
return NotImplemented
for s,o in itertools.zip_longest(self.groups, other.groups):
if s is None or o is None:
return False
s_k, s_v = s
o_k, o_v = o
if s_k and o_k:
if s_v == o_v:
continue
return False
elif s_k or o_k:
return False
else:
if s_v == o_v:
continue
return False
return True
# functools.total_ordering doesn't create single-call wrappers if both
# __le__ and __lt__ exist, so do it manually.
def __gt__(self, other):
op_result = self.__le__(other)
if op_result is NotImplemented:
return op_result
return not op_result
def __ge__(self, other):
op_result = self.__lt__(other)
if op_result is NotImplemented:
return op_result
return not op_result
# __ne__ is the only implied ordering relationship, it automatically
# delegates to __eq__
>>> import natsort
>>> import timeit
>>> l1 = ['Apple', 'corn', 'apPlE', 'arbour', 'Corn', 'Banana', 'apple', 'banana']
>>> l2 = list(map(str, range(30)))
>>> l3 = ["{} {}".format(x,y) for x in l1 for y in l2]
>>> print(timeit.timeit('sorted(l3+["0"], key=NaturalStringA)', number=10000, globals=globals()))
362.4729259099986
>>> print(timeit.timeit('sorted(l3+["0"], key=NaturalStringB)', number=10000, globals=globals()))
189.7340817489967
>>> print(timeit.timeit('sorted(l3+["0"], key=NaturalOrdering.by(OrderingType.PerCharacterSwapCase))', number=10000, globals=globals()))
69.34636392899847
>>> print(timeit.timeit('natsort.natsorted(l3+["0"], alg=natsort.ns.GROUPLETTERS | natsort.ns.LOWERCASEFIRST)', number=10000, globals=globals()))
98.2531585780016
unicodedata.normalize(...)
,并考虑使用str.casefold()
而不是str.lower()
。可能还存在我没有考虑到的微妙编码问题。因此,我暂时推荐natsort库。我快速浏览了github存储库;代码维护一直非常出色。[0-9]
,创建这样的排序将同样令人望而生畏。如果您想要区域设置感知比较,请使用locale.strxfrm
按照Python的Sorting HOW TO准备字符串。padzero_with_lower
,其定义如下:import re
def padzero_with_lower(s):
return re.sub(r'\d+', lambda m: m.group(0).rjust(10, '0'), s).lower()
该算法会做以下处理:
下面是一个使用示例:
print(padzero_with_lower('file1.txt')) # file0000000001.txt
print(padzero_with_lower('file12.txt')) # file0000000012.txt
print(padzero_with_lower('file23.txt')) # file0000000023.txt
print(padzero_with_lower('file123.txt')) # file0000000123.txt
print(padzero_with_lower('file301.txt')) # file0000000301.txt
print(padzero_with_lower('Dir2/file15.txt')) # dir0000000002/file0000000015.txt
print(padzero_with_lower('dir2/file123.txt')) # dir0000000002/file0000000123.txt
print(padzero_with_lower('dir15/file2.txt')) # dir0000000015/file0000000002.txt
print(padzero_with_lower('Dir15/file15.txt')) # dir0000000015/file0000000015.txt
print(padzero_with_lower('elm0')) # elm0000000000
print(padzero_with_lower('elm1')) # elm0000000001
print(padzero_with_lower('Elm2')) # elm0000000002
print(padzero_with_lower('elm9')) # elm0000000009
print(padzero_with_lower('elm10')) # elm0000000010
print(padzero_with_lower('Elm11')) # elm0000000011
print(padzero_with_lower('Elm12')) # elm0000000012
print(padzero_with_lower('elm13')) # elm0000000013
通过测试这个函数,现在我们可以将其用作我们的密钥,即:
lis = ['elm0', 'elm1', 'Elm2', 'elm9', 'elm10', 'Elm11', 'Elm12', 'elm13']
lis.sort(key=padzero_with_lower)
print(lis)
# Output: ['elm0', 'elm1', 'Elm2', 'elm9', 'elm10', 'Elm11', 'Elm12', 'elm13']
casefold()
而不是 lower()
来匹配字符串list.sort
、sorted
、max
等一起使用def natural_sort(key=None, _nsre=re.compile('([0-9]+)')):
return lambda x: [int(text) if text.isdigit() else text.casefold()
for text in _nsre.split(key(x) if key else x)]
示例用法:
# Original solution
data.sort(key=natural_sort())
# Select an additional key
image_files.sort(key=natural_sort(lambda x: x.original_filename))
def sort_naturally(lst: list) -> list:
max_str_len = max([len(s) for s in lst])
return sorted(lst, key=lambda s: s.zfill(max_str_len + 1))
请注意内置函数str.zfill(width)
会返回一个左侧填充ASCII 0
数字的字符串副本,使其长度为width。详细信息请参阅官方文档:docs.python.org/3/library/stdtypes.html#str.zfill
def natural_sort_key(string_or_number):
"""
by Scott S. Lawton <scott@ProductArchitect.com> 2014-12-11; public domain and/or CC0 license
handles cases where simple 'int' approach fails, e.g.
['0.501', '0.55'] floating point with different number of significant digits
[0.01, 0.1, 1] already numeric so regex and other string functions won't work (and aren't required)
['elm1', 'Elm2'] ASCII vs. letters (not case sensitive)
"""
def try_float(astring):
try:
return float(astring)
except:
return astring
if isinstance(string_or_number, basestring):
string_or_number = string_or_number.lower()
if len(re.findall('[.]\d', string_or_number)) <= 1:
# assume a floating point value, e.g. to correctly sort ['0.501', '0.55']
# '.' for decimal is locale-specific, e.g. correct for the Anglosphere and Asia but not continental Europe
return [try_float(s) for s in re.split(r'([\d.]+)', string_or_number)]
else:
# assume distinct fields, e.g. IP address, phone number with '.', etc.
# caveat: might want to first split by whitespace
# TBD: for unicode, replace isdigit with isdecimal
return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_or_number)]
else:
# consider: add code to recurse for lists/tuples and perhaps other iterables
return string_or_number
以下是测试代码和几个链接(在StackOverflow内外): http://productarchitect.com/code/better-natural-sort.py
欢迎反馈。这并不意味着是一个确定的解决方案;只是向前迈出的一步。
在@Mark Byers的回答之后,这里是一个接受key
参数并且更符合PEP8规范的修改版。
def natsorted(seq, key=None):
def convert(text):
return int(text) if text.isdigit() else text
def alphanum(obj):
if key is not None:
return [convert(c) for c in re.split(r'([0-9]+)', key(obj))]
return [convert(c) for c in re.split(r'([0-9]+)', obj)]
return sorted(seq, key=alphanum)
我还创建了一个Gist。
key
参数?但这也在@beauburrier的答案中有例子。 - Ciprian Tomoiagădef natural_sort(l, attrib):
convert = lambda text: int(text) if text.isdigit() else text.lower()
alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key.__dict__[attrib])]
return sorted(l, key=alphanum_key)
results = natural_sort(albums, 'albumid')
其中albums
是一个Album实例的列表,而albumid
是一个字符串属性,通常包含数字。
仅供记录,这里是 Mark Byers 的简单解决方案的另一种变体,类似于 Walter Tross 建议的方法,避免调用 isdigit()
。 这不仅使它更快,而且避免了因为 isdigit()
将更多的 Unicode 字符视为数字而可能出现的问题,而正则表达式 \d+
不会。
import re
from itertools import cycle
_re_digits = re.compile(r"(\d+)")
def natural_comparison_key(key):
return tuple(
int(part) if is_digit else part
for part, is_digit in zip(_re_digits.split(key), cycle((False, True)))
)