一个不使用 itertools.groupby() 的解决方案:
# Sample input: several runs of consecutive '**' mixed with other repeated items.
p = ['**', 'foo', '*', 'bar', 'bar', '**', '**', '**', 'baz', '**', '**',
'foo', '*','*', 'bar', 'bar','bar', '**', '**','foo','bar',]
def treat(A):
    """Yield the items of A with each run of consecutive '**' collapsed to one.

    Every other item, including repeated ones, is passed through unchanged.
    A may be any iterable (list, generator, ...); an empty input yields
    nothing instead of raising IndexError.
    """
    # A fresh object() compares unequal to every real item, so the first
    # item is always yielded.
    prec = object()
    for x in A:
        # Drop x only when it continues a run of '**'.
        if (prec, x) != ('**', '**'):
            yield x
        prec = x
# Show the input, a blank line, then the input with runs of '**' collapsed.
print p
print
print list(treat(p))
结果
['**', 'foo', '*', 'bar', 'bar', '**', '**', '**',
'baz', '**', '**',
'foo', '*', '*', 'bar', 'bar','bar', '**', '**',
'foo', 'bar']
['**', 'foo', '*', 'bar', 'bar', '**',
'baz', '**',
'foo', '*', '*', 'bar', 'bar', 'bar', '**',
'foo', 'bar']
另一种解决方案,灵感来自dugres
from itertools import groupby
# Sample input: several runs of consecutive '**' mixed with other repeated items.
p = ['**', 'foo', '*', 'bar', 'bar', '**', '**', '**', 'baz', '**', '**',
     'foo', '*', '*', 'bar', 'bar', 'bar', '**', '**', 'foo', 'bar']
res = []
# groupby() yields one (key, group) pair per run of equal consecutive items.
for k, g in groupby(p):
    # A run of '**' contributes a single '**'; any other run is kept whole.
    res.extend(['**'] if k == '**' else list(g))
# Single-argument form prints the same text on Python 2 and Python 3.
print(res)
这就像Tom Zych的解决方案,但更简单
.
编辑
# p: sample input that both starts and ends with a run of '**'.
p = ['**','**', 'foo', '*', 'bar', 'bar', '**', '**', '**', 'baz', '**', '**',
'foo', '*','*', 'bar', 'bar','bar', '**', '**','foo','bar', '**', '**', '**']
# q: the items of p interleaved with filler values; the script below reads
# q[::2] through a generator expression to feed treat() a non-list iterable.
q= ['**',12,'**',45, 'foo',78, '*',751, 'bar',4789, 'bar',3, '**', 5,'**',7, '**',
73,'baz',4, '**',8, '**',20,'foo', 8,'*',36,'*', 36,'bar', 11,'bar',0,'bar',9,
'**', 78,'**',21,'foo',27,'bar',355, '**',33, '**',37, '**','end']
def treat(B,dedupl):
    """Yield the items of B with each run of consecutive `dedupl` collapsed.

    B may be any iterable (list, string, generator, ...).  Only runs of the
    single value `dedupl` are shortened; other repeated items are kept.
    An empty input yields nothing.
    """
    B = iter(B)
    # Builtin next() works on Python 2.6+ and 3.  The explicit except keeps
    # an empty input from leaking StopIteration out of the generator
    # (a RuntimeError under PEP 479).
    try:
        prec = next(B)
    except StopIteration:
        return
    yield prec
    for x in B:
        # Skip x only when both it and its predecessor equal dedupl.
        if not (prec == x == dedupl):
            yield x
        prec = x
# Feed treat() from a generator expression to show it accepts any iterable.
print 'gen = ( x for x in q[::2])'
gen = ( x for x in q[::2])
print 'list(gen)==p is ',list(gen)==p
# The generator above is exhausted by list(gen), so build a fresh one.
gen = ( x for x in q[::2])
print 'list(treat(gen)==',list(treat(gen,'**'))
# A string works as the iterable too; the caller re-joins the characters.
ch = '??h4i4???4t4y?45l????hmo4j5???'
print '\nch==',ch
print "''.join(treat(ch,'?'))==",''.join(treat(ch,'?'))
# Empty input.
print "\nlist(treat([],'%%'))==",list(treat([],'%%'))
结果
gen = ( x for x in q[::2])
list(gen)==p is True
list(treat(gen)== ['**', 'foo', '*', 'bar', 'bar', '**', 'baz', '**', 'foo', '*', '*', 'bar', 'bar', 'bar', '**', 'foo', 'bar', '**']
ch== ??h4i4???4t4y?45l????hmo4j5???
''.join(treat(ch,'?'))== ?h4i4?4t4y?45l?hmo4j5?
list(treat([],'%%'))== []
.
备注:使用生成器函数时,只需在调用处包一层转换(例如 list(...) 或 ''.join(...)),就能让输出类型与输入类型相匹配,而不需要更改生成器函数的内部代码;
而 Tom Zych 的解决方案则不那么容易适应不同的输入类型。
.
编辑 2
我搜索了一种使用列表推导或生成器表达式的单行方法。
我发现有两种方法可以做到这一点,我认为没有 groupby() 是不可能的。
from itertools import groupby
from operator import concat
# Sample input with runs of '**' and a run of 'sun'.
p = ['**', '**','foo', '*', 'bar', 'bar', '**', '**', '**',
'bar','**','foo','sun','sun','sun']
print 'p==',p,'\n'
# Values whose consecutive repetitions must be collapsed.
dedupl = ("**",'sun')
print 'dedupl==',repr(dedupl)
# One-liner 1: nested comprehension; a run whose key is in dedupl contributes
# the 1-tuple (k,), any other run contributes all of its items.
print [ x for k, g in groupby(p) for x in ((k,) if k in dedupl else g) ]
# One-liner 2: build one list per run and concatenate them with reduce().
print reduce(concat,( [k] if k in dedupl else list(g) for k, g in groupby(p)),[])
基于同样的原理,将dugres函数转换为生成器函数非常容易:
from itertools import groupby
def compress(iterable, to_compress):
    """Yield the items of `iterable`, collapsing runs of selected values.

    A run of equal consecutive items is reduced to a single item when its
    value passes the test `k in to_compress`; every other run is yielded
    in full.  Note that when `to_compress` is a string, `in` is a substring
    test.
    """
    # groupby() yields one (key, group) pair per run of equal consecutive items.
    for k, g in groupby(iterable):
        if k in to_compress:
            yield k
        else:
            for x in g:
                yield x
然而,这个生成器函数有两个缺点:
我稍微修改了它们,使它们能够接受一个要去重的项目序列,并测量了执行时间:
from time import clock
from itertools import groupby
def squeeze(iterable, victims, _dummy=object()):
    """Yield the items of `iterable`, collapsing runs of any value in `victims`.

    `victims` may be a container of values or a single value; a single value
    is wrapped in a 1-tuple when `iterable` has an `__iter__` attribute.
    `_dummy` is a private sentinel used as the initial "previous item"; it
    is not meant to be passed by callers.
    """
    # NOTE: this guard relies on Python 2, where str has no __iter__.  On
    # Python 3 str defines __iter__, so a bare string victim is not wrapped.
    if hasattr(iterable, '__iter__') and not hasattr(victims, '__iter__'):
        victims = (victims,)
    previous = _dummy
    for item in iterable:
        # Skip an item only when it is a victim that repeats its predecessor.
        if item in victims and item == previous:
            continue
        previous = item
        yield item
def treat(B,victims):
    """Yield the items of B, collapsing runs of any value in `victims`.

    `victims` may be a container of values or a single value; a single value
    is wrapped in a 1-tuple when B has an `__iter__` attribute.  An empty
    input yields nothing.
    """
    # NOTE: this guard relies on Python 2, where str has no __iter__.  On
    # Python 3 str defines __iter__, so a bare string victim is not wrapped.
    if hasattr(B, '__iter__') and not hasattr(victims, '__iter__'):
        victims = (victims,)
    B = iter(B)
    # Builtin next() works on Python 2.6+ and 3.  The explicit except keeps
    # an empty input from leaking StopIteration out of the generator.
    try:
        prec = next(B)
    except StopIteration:
        return
    yield prec
    for x in B:
        # Keep x unless it is a victim that repeats its predecessor.
        if x not in victims or x != prec:
            yield x
        prec = x
def compress(iterable, to_compress):
    """Yield the items of `iterable`, collapsing runs of values in `to_compress`.

    `to_compress` may be a container of values or a single value; a single
    value is wrapped in a 1-tuple when `iterable` has an `__iter__` attribute.
    """
    # NOTE: this guard relies on Python 2, where str has no __iter__.  On
    # Python 3 str defines __iter__, so a bare string is not wrapped.
    if hasattr(iterable, '__iter__') and not hasattr(to_compress, '__iter__'):
        to_compress = (to_compress,)
    # groupby() yields one (key, group) pair per run of equal consecutive items.
    for k, g in groupby(iterable):
        if k in to_compress:
            yield k
        else:
            for x in g:
                yield x
p = ['**', '**','su','foo', '*', 'bar', 'bar', '**', '**', '**',
'su','su','**','bin', '*','*','bar','bar','su','su','su']
n = 10000
te = clock()
for i in xrange(n):
a = list(compress(p,('**','sun')))
print clock()-te,' generator function with groupby()'
te = clock()
for i in xrange(n):
b = list(treat(p,('**','sun')))
print clock()-te,' generator function eyquem'
te = clock()
for i in xrange(n):
c = list(squeeze(p,('**','sun')))
print clock()-te,' generator function John Machin'
print p
print 'a==b==c is ',a==b==c
print a
指令
if hasattr(iterable, '__iter__') and not hasattr(to_compress, '__iter__'):
    to_compress = (to_compress,)
当可迭代参数是一个序列、而另一个参数只是单个字符串时,这条指令是必要的,用来避免出错:此时必须把后者包装成一个容器,前提是可迭代参数本身不是字符串。
这依赖于如下事实(在 Python 2 中):元组、列表、集合等容器具有 __iter__ 方法,而字符串没有。以下代码展示了这个问题:
def compress(iterable, to_compress):
    """Yield `iterable` with runs of values in `to_compress` collapsed.

    Reference version of the demonstration: a single value passed as
    `to_compress` is wrapped in a 1-tuple, but only when `iterable` itself
    has an `__iter__` attribute.  Prints the effective `to_compress` when
    iteration starts.
    """
    # NOTE: this guard relies on Python 2, where str has no __iter__.
    if hasattr(iterable, '__iter__') and not hasattr( to_compress, '__iter__'):
        to_compress = (to_compress,)
    # Single-argument print: same output on Python 2 and Python 3.
    print('t_compress== %s' % repr(to_compress))
    for k, g in groupby(iterable):
        if k in to_compress:
            yield k
        else:
            for x in g:
                yield x
def compress_bof(iterable, to_compress):
    """Deliberately flawed variant of compress(), kept as a counter-example.

    It wraps `to_compress` in a 1-tuple whenever `to_compress` lacks
    `__iter__`, without looking at `iterable`.  Per the captured output
    that follows, with a string iterable and a string `to_compress` the
    whole string becomes a single candidate and nothing is collapsed.
    Prints the effective `to_compress` when iteration starts.
    """
    if not hasattr(to_compress, '__iter__'):
        to_compress = (to_compress,)
    # Single-argument print: same output on Python 2 and Python 3.
    print('t_compress== %s' % repr(to_compress))
    for k, g in groupby(iterable):
        if k in to_compress:
            yield k
        else:
            for x in g:
                yield x
def compress_bug(iterable, to_compress_bug):
    """Deliberately flawed variant of compress(), kept as a counter-example.

    `to_compress_bug` is used as given, with no wrapping.  When it is a
    string and `iterable` is a list of strings, `k in to_compress_bug` is a
    substring test, so any item that is a substring of it gets collapsed.
    Prints `to_compress_bug` when iteration starts.
    """
    # Single-argument print: same output on Python 2 and Python 3.
    print('t_compress== %s' % repr(to_compress_bug))
    for k, g in groupby(iterable):
        if k in to_compress_bug:
            yield k
        else:
            for x in g:
                yield x
# Case 1: the iterable is a string and dedupl is a string of characters to collapse.
q = ';;;htr56;but78;;;;$$$$;ios4!'
print 'q==',q
dedupl = ";$"
print 'dedupl==',repr(dedupl)
print
print "''.join(compress (q,"+repr(dedupl)+")) :\n",''.join(compress (q,dedupl))+\
' <-CORRECT ONE'
print
print "''.join(compress_bof(q,"+repr(dedupl)+")) :\n",''.join(compress_bof(q,dedupl))+\
' <====== error ===='
print
print "''.join(compress_bug(q,"+repr(dedupl)+")) :\n",''.join(compress_bug(q,dedupl))
print '\n\n\n'
# Case 2: the iterable is a list of strings and dedupl is one whole string to collapse.
q = [';$', ';$',';$','foo', ';', 'bar','bar',';',';',';','$','$','foo',';$12',';$12']
print 'q==',q
dedupl = ";$12"
print 'dedupl==',repr(dedupl)
print
print 'list(compress (q,'+repr(dedupl)+')) :\n',list(compress (q,dedupl)),\
' <-CORRECT ONE'
print
print 'list(compress_bof(q,'+repr(dedupl)+')) :\n',list(compress_bof(q,dedupl))
print
print 'list(compress_bug(q,'+repr(dedupl)+')) :\n',list(compress_bug(q,dedupl)),\
' <====== error ===='
print
结果
q== ;;;htr56;but78;;;;$$$$;ios4!
dedupl== ';$'
''.join(compress (q,';$')) :
t_compress== ';$'
;htr56;but78;$;ios4! <-CORRECT ONE
''.join(compress_bof(q,';$')) :
t_compress== (';$',)
;;;htr56;but78;;;;$$$$;ios4! <====== error ====
''.join(compress_bug(q,';$')) :
t_compress== ';$'
;htr56;but78;$;ios4!
q== [';$', ';$', ';$', 'foo', ';', 'bar', 'bar', ';', ';', ';', '$', '$', 'foo', ';$12', ';$12']
dedupl== ';$12'
list(compress (q,';$12')) :
t_compress== (';$12',)
[';$', ';$', ';$', 'foo', ';', 'bar', 'bar', ';', ';', ';', '$', '$', 'foo', ';$12'] <-CORRECT ONE
list(compress_bof(q,';$12')) :
t_compress== (';$12',)
[';$', ';$', ';$', 'foo', ';', 'bar', 'bar', ';', ';', ';', '$', '$', 'foo', ';$12']
list(compress_bug(q,';$12')) :
t_compress== ';$12'
[';$', 'foo', ';', 'bar', 'bar', ';', '$', 'foo', ';$12'] <====== error ====
我获得了以下执行时间:
0.390163274941 generator function with groupby()
0.324547114228 generator function eyquem
0.310176572721 generator function John Machin
['**', '**', 'su', 'foo', '*', 'bar', 'bar', '**', '**', '**', 'su', 'su', '**', 'bin', '*', '*', 'bar', 'bar', 'su', 'su', 'su']
a==b==c is True
['**', 'su', 'foo', '*', 'bar', 'bar', '**', 'su', 'su', '**', 'bin', '*', '*', 'bar', 'bar', 'su', 'su', 'su']
我更喜欢John Machin的解决方案,因为它没有像我的解决方案一样的指令B = iter(B)。
但是,
previous = _dummy
与
_dummy = object()
的指令对我来说看起来很奇怪。因此,最终我认为更好的解决方案是以下代码:即使以字符串作为可迭代参数,它也能正常工作,而且其中最先定义的对象 previous 不是一个虚设的假值:
def squeeze(iterable, victims):
    """Yield the items of `iterable`, collapsing runs of any value in `victims`.

    `victims` may be a container of values or a single value; a single value
    is wrapped in a 1-tuple when `iterable` has an `__iter__` attribute.
    No artificial sentinel is used: `previous` starts as the real first item.
    An empty input yields nothing.
    """
    # NOTE: this guard relies on Python 2, where str has no __iter__.
    if hasattr(iterable, '__iter__') and not hasattr(victims, '__iter__'):
        victims = (victims,)
    # Use ONE iterator for both steps.  Looping over `iterable` twice would
    # restart a list from its first element (dropping a leading victim) and
    # would silently lose the first item of a one-shot iterator.
    iterator = iter(iterable)
    try:
        previous = next(iterator)
    except StopIteration:
        return
    yield previous
    for item in iterator:
        # Skip an item only when it is a victim that repeats its predecessor.
        if item in victims and item == previous:
            continue
        previous = item
        yield item
.
编辑 3
我曾经理解object()被用作哨兵。
但是 object 被调用这一事实让我感到困惑。昨天,我认为 object 非常特殊,不可能出现在传递给 squeeze() 的任何可迭代对象中。因此,John Machin,我不明白您为什么要调用它,这也让我对它的性质产生了怀疑;这就是我请您确认 object 是超级元类的原因。
但是今天,我想我明白了为什么在您的代码中调用object。
实际上,object很可能出现在一个可迭代对象中,为什么不呢?超级元类object本身就是一个对象,因此在可迭代对象上进行去重之前,有可能已经将其放入了可迭代对象中。因此,使用object本身作为哨兵是不正确的做法。
.
所以你没有使用对象,而是使用了一个实例object()作为哨兵。
但我想知道为什么选择这个神秘的东西,即调用object的返回值是什么?
我的思考继续进行,我注意到了一个可能是这个调用原因的事情:
调用object会创建一个实例,因为object是Python中最基本的类,每次创建一个实例时,它都是与之前创建的任何实例不同的对象,并且其值始终不同于任何先前object的实例的值:
# Each call to object() creates a new, distinct instance.
a = object()
b = object()
c = object()
d = object()
# Print the four ids, then the pairwise equality comparisons.
print id(a),'\n',id(b),'\n',id(c),'\n',id(d)
print a==b,a==c,a==d
print b==c,b==d,c==d
结果
10818752
10818760
10818768
10818776
False False False
False False False
所以可以确定
_dummy=object()
是一个唯一的对象,具有唯一的
id和唯一的值。顺便问一下,我想知道一个
object实例的值是什么。无论如何,以下代码展示了使用
_dummy=object
存在的问题,而使用
_dummy=object()
则没有问题。
def imperfect_squeeze(iterable, victim, _dummy=object):
    """Deliberately flawed squeeze(), kept as a counter-example.

    The sentinel is the class `object` itself rather than an instance.
    If `iterable` starts with `object` and `object` is in `victim`, the
    first item equals the sentinel and is wrongly dropped.
    Prints the ids of the sentinel and of `iterable[0]`, so `iterable` must
    be a non-empty sequence.
    """
    previous = _dummy
    # Single-argument prints: same output on Python 2 and Python 3.
    print('id(previous) == %s' % id(previous))
    print('id(iterable[0])== %s' % id(iterable[0]))
    for item in iterable:
        if item in victim and item == previous:
            continue
        previous = item
        yield item
def squeeze(iterable, victim, _dummy=object()):
    """Yield `iterable` with runs of any value in `victim` collapsed.

    The sentinel is a private `object()` instance created once, when the
    function is defined, so it differs from every item a caller can pass.
    Prints the ids of the sentinel and of `iterable[0]`, so `iterable` must
    be a non-empty sequence.
    """
    previous = _dummy
    # Single-argument prints: same output on Python 2 and Python 3.
    print('id(previous) == %s' % id(previous))
    print('id(iterable[0])== %s' % id(iterable[0]))
    for item in iterable:
        if item in victim and item == previous:
            continue
        previous = item
        yield item
# Case 1: the class object itself is an item and a victim; it collides with
# the default sentinel of imperfect_squeeze().
wat = object
li = [wat,'**','**','foo',wat,wat]
print 'imperfect_squeeze\n''li before ==',li
print map(id,li)
li = list(imperfect_squeeze(li,[wat,'**']))
print 'li after ==',li
print
# Case 2: one object() instance used three times in the list.
wat = object()
li = [wat,'**','**','foo',wat,wat]
print 'squeeze\n''li before ==',li
print map(id,li)
li = list(squeeze(li,[wat,'**']))
print 'li after ==',li
print
# Case 3: three distinct object() instances.
li = [object(),'**','**','foo',object(),object()]
print 'squeeze\n''li before ==',li
print map(id,li)
li = list(squeeze(li,[li[0],'**']))
print 'li after ==',li
结果
imperfect_squeeze
li before == [<type 'object'>, '**', '**', 'foo', <type 'object'>, <type 'object'>]
[505317320, 18578968, 18578968, 13208848, 505317320, 505317320]
id(previous) == 505317320
id(iterable[0])== 505317320
li after == ['**', 'foo', <type 'object'>]
squeeze
li before == [<object object at 0x00A514C8>, '**', '**', 'foo', <object object at 0x00A514C8>, <object object at 0x00A514C8>]
[10818760, 18578968, 18578968, 13208848, 10818760, 10818760]
id(previous) == 10818752
id(iterable[0])== 10818760
li after == [<object object at 0x00A514C8>, '**', 'foo', <object object at 0x00A514C8>]
squeeze
li before == [<object object at 0x00A514D0>, '**', '**', 'foo', <object object at 0x00A514D8>, <object object at 0x00A514E0>]
[10818768, 18578968, 18578968, 13208848, 10818776, 10818784]
id(previous) == 10818752
id(iterable[0])== 10818768
li after == [<object object at 0x00A514D0>, '**', 'foo', <object object at 0x00A514D8>, <object object at 0x00A514E0>]
问题在于经过 imperfect_squeeze() 处理后,列表中缺少 <type 'object'>
作为第一个元素。
然而,必须注意,这个“问题”只可能出现在第一个元素恰好是 object 的列表中:对于概率如此之小的情况来说,这番思考未免太多了……但是严谨的编码人员会考虑到所有情况。
如果我们使用 list 而不是 object,结果会有所不同:
def imperfect_sqlize(iterable, victim, _dummy=list):
    """Deliberately flawed variant whose sentinel is the class `list`.

    If `iterable` starts with `list` and `list` is in `victim`, the first
    item equals the sentinel and is wrongly dropped.
    Prints the ids of the sentinel and of `iterable[0]`, so `iterable` must
    be a non-empty sequence.
    """
    previous = _dummy
    # Single-argument prints: same output on Python 2 and Python 3.
    print('id(previous) == %s' % id(previous))
    print('id(iterable[0])== %s' % id(iterable[0]))
    for item in iterable:
        if item in victim and item == previous:
            continue
        previous = item
        yield item
def sqlize(iterable, victim, _dummy=list()):
    """Deliberately flawed variant whose sentinel is an empty list instance.

    Unlike `object()` instances, empty lists compare equal to each other.
    A leading `[]` item that is also in `victim` therefore equals the
    sentinel and is wrongly dropped.  The default list is only compared,
    never mutated.
    Prints the ids of the sentinel and of `iterable[0]`, so `iterable` must
    be a non-empty sequence.
    """
    previous = _dummy
    # Single-argument prints: same output on Python 2 and Python 3.
    print('id(previous) == %s' % id(previous))
    print('id(iterable[0])== %s' % id(iterable[0]))
    for item in iterable:
        if item in victim and item == previous:
            continue
        previous = item
        yield item
# Case 1: the class list itself is an item and a victim; it collides with
# the default sentinel of imperfect_sqlize().
wat = list
li = [wat,'**','**','foo',wat,wat]
print 'imperfect_sqlize\n''li before ==',li
print map(id,li)
li = list(imperfect_sqlize(li,[wat,'**']))
print 'li after ==',li
print
# Case 2: one empty list instance used three times in the list.
wat = list()
li = [wat,'**','**','foo',wat,wat]
print 'sqlize\n''li before ==',li
print map(id,li)
li = list(sqlize(li,[wat,'**']))
print 'li after ==',li
print
# Case 3: three distinct empty list instances.
li = [list(),'**','**','foo',list(),list()]
print 'sqlize\n''li before ==',li
print map(id,li)
li = list(sqlize(li,[li[0],'**']))
print 'li after ==',li
结果
imperfect_sqlize
li before == [<type 'list'>, '**', '**', 'foo', <type 'list'>, <type 'list'>]
[505343304, 18578968, 18578968, 13208848, 505343304, 505343304]
id(previous) == 505343304
id(iterable[0])== 505343304
li after == ['**', 'foo', <type 'list'>]
sqlize
li before == [[], '**', '**', 'foo', [], []]
[18734936, 18578968, 18578968, 13208848, 18734936, 18734936]
id(previous) == 18734656
id(iterable[0])== 18734936
li after == ['**', 'foo', []]
sqlize
li before == [[], '**', '**', 'foo', [], []]
[18734696, 18578968, 18578968, 13208848, 18735016, 18734816]
id(previous) == 18734656
id(iterable[0])== 18734696
li after == ['**', 'foo', []]
在Python中,除了object之外,还有其他对象具有这种特性吗?
John Machin,为什么您在生成器函数中选择了object的实例作为哨兵?您是否已经知道上述特性?