数据如下所示:
![enter image description here](https://istack.dev59.com/CNxL7.webp)
import csv
# Open the input and output files with context managers so that both handles
# are closed even if creating the reader or writer raises.
# The binary 'rb'/'wb' modes are the ones used with the Python 2 csv module.
with open('xxx.csv', 'rb') as f, open('lll.csv', 'wb') as wt:
    reader = csv.reader(f)
    # QUOTE_ALL wraps every output field in quotes.
    writer = csv.writer(wt, quoting=csv.QUOTE_ALL)
结果如下所示:
我应该怎么做来解决这个问题?
import csv
# Open the input and output files with context managers so that both handles
# are closed even if creating the reader or writer raises.
# The binary 'rb'/'wb' modes are the ones used with the Python 2 csv module.
with open('xxx.csv', 'rb') as f, open('lll.csv', 'wb') as wt:
    reader = csv.reader(f)
    # QUOTE_ALL wraps every output field in quotes.
    writer = csv.writer(wt, quoting=csv.QUOTE_ALL)
结果如下所示:
我应该怎么做来解决这个问题?
另一种选择:
使用unicodecsv软件包中的代码...
https://pypi.python.org/pypi/unicodecsv/
>>> import unicodecsv as csv
>>> from io import BytesIO
>>> f = BytesIO()
>>> w = csv.writer(f, encoding='utf-8')
>>> _ = w.writerow((u'é', u'ñ'))
>>> _ = f.seek(0)
>>> r = csv.reader(f, encoding='utf-8')
>>> next(r) == [u'é', u'ñ']
True
这个模块与标准库的csv模块具有API兼容性。
请确保适当编码和解码。
此示例将utf-8编码的一些示例文本往返于csv文件,并重新演示出来:
# -*- coding: utf-8 -*-
import csv
# Sample text per language; every cell is a unicode string.
tests = {'German': [u'Straße', u'auslösen', u'zerstören'],
         'French': [u'français', u'américaine', u'épais'],
         'Chinese': [u'中國的', u'英語', u'美國人']}

# Write: the Python 2 csv module works on byte strings, so each unicode cell
# is encoded to UTF-8 before it is handed to the writer.
with open('/tmp/utf.csv', 'w') as fout:
    writer = csv.writer(fout)
    # Header row: the language names.
    writer.writerows([tests.keys()])
    # zip(*values) turns the per-language columns into rows.
    for row in zip(*tests.values()):
        row = [s.encode('utf-8') for s in row]
        writer.writerows([row])

# Read back: cells come out as UTF-8 byte strings and are decoded to unicode
# before being formatted into fixed-width columns.
with open('/tmp/utf.csv', 'r') as fin:
    reader = csv.reader(fin)
    for row in reader:
        temp = list(row)
        fmt = u'{:<15}' * len(temp)
        print(fmt.format(*[s.decode('utf-8') for s in temp]))
输出:
German Chinese French
Straße 中國的 français
auslösen 英語 américaine
zerstören 美國人 épais
u'Straße',它们在内部仍然是(转义为)ASCII(u'Stra\xdfe'),因此您必须将所有内容转换/编码为 UTF-8(转义字符串)('Stra\xc3\x9fe')才能将它们写入 UTF-8 编码的文件? - doncherry

在 csv 模块文档的末尾有一个示例,演示了如何处理 Unicode。下面是从该示例直接复制的内容。请注意,读取或写入的字符串将是 Unicode 字符串。例如,不要传递字节字符串到 UnicodeWriter.writerows。
import csv,codecs,cStringIO
class UTF8Recoder:
    """Iterator that reads an encoded stream and re-encodes it to UTF-8.

    ``f`` is a byte stream in ``encoding``. Each item produced is the next
    chunk returned by the decoding reader, re-encoded as a UTF-8 byte string.
    """

    def __init__(self, f, encoding):
        # Wrap the byte stream in a decoding reader for the source encoding.
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        # The built-in next() (Python 2.6+) dispatches to the reader's own
        # iterator method, whichever name that method has.
        return next(self.reader).encode("utf-8")
class UnicodeReader:
    """CSV reader that returns each row as a list of unicode strings.

    Python 2 only (uses the ``unicode`` built-in). The file ``f`` is passed
    through UTF8Recoder, so csv.reader always parses UTF-8 byte strings
    whatever the source ``encoding`` is. ``dialect`` and ``**kwds`` are
    forwarded to csv.reader.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)

    def next(self):
        """Return the next row with every cell decoded from UTF-8 to unicode."""
        row = next(self.reader)
        return [unicode(s, "utf-8") for s in row]

    def __iter__(self):
        return self
class UnicodeWriter:
    """CSV writer that accepts rows of unicode strings (Python 2).

    Rows are first formatted by csv.writer into an in-memory queue as UTF-8,
    then re-encoded to ``encoding`` and written to the stream ``f``.
    ``dialect`` and ``**kwds`` are forwarded to csv.writer.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
        # csv.writer writes into this queue, not directly into the stream.
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # One incremental encoder is kept for the whole life of the writer.
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write one row; every cell must be a unicode string."""
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch the UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... re-encode it into the target encoding ...
        data = self.encoder.encode(data)
        # ... and write it to the target stream.
        self.stream.write(data)
        # Empty the queue for the next row.
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row in ``rows`` via writerow()."""
        for row in rows:
            self.writerow(row)
# Copy xxx.csv to lll.csv, quoting every output field. Both files are opened
# in binary mode; the reader and writer classes do the decoding and encoding.
with open('xxx.csv', 'rb') as fin, open('lll.csv', 'wb') as fout:
    reader = UnicodeReader(fin)
    writer = UnicodeWriter(fout, quoting=csv.QUOTE_ALL)
    for line in reader:
        writer.writerow(line)
输入(UTF-8编码):
American,美国人
French,法国人
German,德国人
输出:
"American","美国人"
"French","法国人"
"German","德国人"
在 self.writer.writerow([s.encode("utf-8") for s in row]) 中遇到了 UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 1: ordinal not in range(128) 的错误。请问有什么建议吗? - Ahsan

这是 UnicodeDecodeError。这意味着 s 最初不是 Unicode,因此 Python 2.X 会先使用默认的 ascii 编解码器将其解码为 Unicode。确保将 Unicode 字符串传递给 UnicodeWriter。 - Mark Tolonen

在 Python 2 中,str 实际上是 bytes 类型。因此,如果想要将 unicode 写入 CSV 文件,必须使用 utf-8 编码将 unicode 转换为 str。
def py2_unicode_to_str(u):
    """Encode a Python 2 ``unicode`` object as a UTF-8 byte string.

    Raises AssertionError when ``u`` is not a ``unicode`` instance.
    """
    # The ``unicode`` type exists only in Python 2.
    assert isinstance(u, unicode)
    return u.encode('utf-8')
class csv.DictWriter(csvfile, fieldnames, restval='', extrasaction='raise', dialect='excel', *args, **kwds):

Python 2:
- csvfile:open(fp, 'w')
- 键和值以 bytes 传递,这些字节使用 utf-8 进行编码:
  writer.writerow({py2_unicode_to_str(k): py2_unicode_to_str(v) for k,v in row.items()})

Python 3:
- csvfile:open(fp, 'w')
- 将包含 str 的普通字典作为 row 传递到 writer.writerow(row)
import sys
# True when the interpreter's major version is 2.
is_py2 = (sys.version_info.major == 2)
def py2_unicode_to_str(u):
    """Encode a Python 2 ``unicode`` object as a UTF-8 byte string.

    Raises AssertionError when ``u`` is not a ``unicode`` instance.
    """
    # The ``unicode`` type exists only in Python 2.
    assert isinstance(u, unicode)
    return u.encode('utf-8')
# Write one dictionary as a single CSV row, on either Python 2 or Python 3.
with open('file.csv', 'w') as f:
    if is_py2:
        data = {u'Python中国': u'Python中国', u'Python中国2': u'Python中国2'}
        # just one more line to handle this: encode every key and value to
        # UTF-8 byte strings before they reach the writer
        data = {py2_unicode_to_str(k): py2_unicode_to_str(v) for k, v in data.items()}
    else:
        # On Python 3 the text (str) dictionary is passed as-is.
        data = {'Python中国': 'Python中国', 'Python中国2': 'Python中国2'}
    # `data` is one row (column name -> value), so its keys are the field
    # names. Indexing it with data[0] would raise KeyError, and iterating it
    # would hand bare key strings to writerow().
    fields = list(data)
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writerow(data)
在 Python 3 中,只需使用 Unicode 的 str。
在 Python 2 中,使用 unicode 来处理文本,在 I/O 发生时使用 str。
我曾经遇到同样的问题。答案是你已经做得很正确了。这是 MS Excel 的问题。尝试使用另一个编辑器打开文件,你会发现你的编码已经成功了。为了让 MS Excel 满意,请从 UTF-8 切换到 UTF-16。这应该可以解决问题:
class UnicodeWriter:
    """CSV writer that emits rows in ``encoding`` (UTF-16 by default).

    Python 2 only (uses ``StringIO`` and ``unicode``). Rows are formatted by
    csv.writer into an in-memory queue as UTF-8, then re-encoded to the
    target encoding and written to the stream ``f``. ``dialect`` and
    ``**kwds`` are forwarded to csv.writer.
    """

    def __init__(self, f, dialect=csv.excel_tab, encoding="utf-16", **kwds):
        # Redirect output to a queue
        self.queue = StringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # Force BOM: written once here, at the very start of the stream
        if encoding == "utf-16":
            import codecs
            f.write(codecs.BOM_UTF16)
        self.encoding = encoding

    def writerow(self, row):
        """Write one row; each cell is converted with unicode() first."""
        # Modified from original: now using unicode(s) to deal with e.g. ints
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = data.encode(self.encoding)
        # strip BOM: encode("utf-16") prepends one to every chunk, but only
        # the one written in __init__ should appear in the output
        if self.encoding == "utf-16":
            data = data[2:]
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row in ``rows`` via writerow()."""
        for row in rows:
            self.writerow(row)
我无法回复Mark的留言,但我刚刚做了一个修改,修复了如果单元格中的数据不是Unicode(例如浮点或整数数据)而导致的错误。我将以下行替换为UnicodeWriter函数:“self.writer.writerow([s.encode("utf-8") if type(s)==types.UnicodeType else s for s in row])”,使其变成:
class UnicodeWriter:
    """CSV writer that accepts unicode and non-unicode cells (Python 2).

    Rows are formatted by csv.writer into an in-memory queue as UTF-8, then
    re-encoded to ``encoding`` and written to the stream ``f``. ``dialect``
    and ``**kwds`` are forwarded to csv.writer.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
        # csv.writer writes into this queue, not directly into the stream.
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write one row; unicode cells are encoded, other cells pass through."""
        # Only unicode cells are encoded to UTF-8; other values (e.g. ints or
        # floats) go to csv.writer unchanged. This line needs the `types`
        # module to be imported.
        self.writer.writerow([s.encode("utf-8") if type(s)==types.UnicodeType else s for s in row])
        # Fetch the UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... re-encode it into the target encoding ...
        data = self.encoder.encode(data)
        # ... and write it to the target stream.
        self.stream.write(data)
        # Empty the queue for the next row.
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row in ``rows`` via writerow()."""
        for row in rows:
            self.writerow(row)
我认为这不是最好的答案,但可能是最自包含且最有趣的答案。
UTF7 是 Unicode 的 7 位 ASCII 编码。恰好 UTF7 不会特别使用逗号、引号或空格,它只是将它们从输入传递到输出。因此,如果您先进行 UTF7 编码然后解析为 CSV,或者先解析为 CSV 然后再进行 UTF7 编码,实际上并没有区别。Python 2 的 CSV 解析器无法处理 Unicode,但 Python 2 确实有一个 UTF-7 编码器。所以你可以先编码、解析,然后解码,就好像你有一个支持 Unicode 的解析器。
import csv
import io
def read_csv(path):
    """Yield the rows of a UTF-8 encoded CSV file as lists of text strings.

    Every line is UTF-7 encoded (pure ASCII) before it reaches the csv
    parser, and every parsed cell is decoded from UTF-7 afterwards, so the
    parser itself only ever sees ASCII.

    The whole file is read before the first row is yielded.
    """
    # newline='' disables newline translation, so each line keeps its own
    # terminator and line breaks inside quoted fields reach the parser intact.
    # With the default translation "\r\n" becomes "\n", and splitting on
    # "\r\n" would leave the whole file as one string.
    with io.open(path, 'rt', encoding='utf8', newline='') as f:
        lines = [line.encode('utf7').decode('ascii') for line in f]
    reader = csv.reader(lines, dialect=csv.excel)
    for row in reader:
        yield [x.encode('ascii').decode('utf7') for x in row]
# Print the repr of every row parsed from lol.csv.
for row in read_csv("lol.csv"):
    print(repr(row))
lol.csv
foo,bar,foo∆bar,"foo,bar"
输出:
[u'foo', u'bar', u'foo\u2206bar', u'foo,bar']