我不关心它们之间的差异,我只想知道内容是否不同。
较底层的方法:
from __future__ import with_statement
# Low-level approach: read both files completely and compare the contents.
# Binary mode makes this a byte-for-byte comparison: no newline translation
# and no decoding errors on non-text files.
with open(filename1, "rb") as f1:
    with open(filename2, "rb") as f2:
        if f1.read() == f2.read():
            ...
高层次方法:
import filecmp
# shallow=False makes filecmp.cmp() compare the actual file contents instead
# of trusting matching os.stat() signatures.
if filecmp.cmp(filename1, filename2, shallow=False):
    ...
如果你想要基本的效率,你可能首先要检查文件大小:
import os

# Cheap pre-check: files of different sizes can never have equal contents,
# so the (potentially expensive) reads only happen when the sizes match.
if os.path.getsize(filename1) == os.path.getsize(filename2):
    # Open the paths held in the variables (not the literal names
    # 'filename1'/'filename2') and make sure both handles get closed.
    with open(filename1, "rb") as f1, open(filename2, "rb") as f2:
        if f1.read() == f2.read():
            # Files are the same.
            pass
这样可以避免逐行读取两个大小不同(因此内容不可能相同)的文件,从而节省时间。
(甚至更进一步的是,您可以调用每个文件的快速MD5sum并比较它们,但这不是“在Python中”,所以我就到此为止了。)
这是一个函数式的文件比较函数。如果文件大小不同,它会立即返回 False;否则,它以 4KiB 块大小读取文件,并在第一个不同之处立即返回 False:
from __future__ import with_statement
import os
import itertools, functools, operator
# Pick a lazy zip: itertools.izip on Python 2, the builtin zip on Python 3
# (where itertools.izip no longer exists and zip is already lazy).
try:
    izip = itertools.izip  # Python 2
except AttributeError:
    izip = zip  # Python 3


def filecmp(filename1, filename2):
    """Return True if the two files have exactly the same contents.

    The sizes are compared first; if they differ, False is returned without
    reading any data. Otherwise both files are read in 4 KiB chunks and the
    comparison stops at the first pair of chunks that differ.

    Arguments:
        filename1: Path of the first file.
        filename2: Path of the second file.

    Returns:
        True if the contents are byte-for-byte identical, otherwise False.
    """
    with open(filename1, "rb") as fp1, open(filename2, "rb") as fp2:
        if os.fstat(fp1.fileno()).st_size != os.fstat(fp2.fileno()).st_size:
            return False  # different sizes, therefore not equal
        # set up one 4k-reader for each file
        fp1_reader = functools.partial(fp1.read, 4096)
        fp2_reader = functools.partial(fp2.read, 4096)
        # pair up the 4k-chunks of both readers until one returns b'' (EOF)
        cmp_pairs = izip(iter(fp1_reader, b""), iter(fp2_reader, b""))
        # lazily yield True for every pair of chunks that is not equal
        inequalities = itertools.starmap(operator.ne, cmp_pairs)
        # any() stops at the first True value, i.e. at the first difference
        return not any(inequalities)
if __name__ == "__main__":
    import sys

    # Command-line use: compare the two paths given as arguments.
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # matching the Python 2/3 compatibility shim used for izip.
    print(filecmp(sys.argv[1], sys.argv[2]))
另一种不同的观点 :)
由于我无法评论其他人的答案,所以我会写下自己的答案。
如果您使用md5,绝对不能只使用md5.update(f.read()),因为这样会占用过多内存。
def get_file_md5(f, chunk_size=8192):
    """Return the hexadecimal MD5 digest of an open file object.

    The data is hashed chunk by chunk, so at most ``chunk_size`` bytes are
    held in memory at a time regardless of the file size.

    Arguments:
        f: A file-like object whose ``read(n)`` returns bytes (i.e. opened
            in binary mode). Reading starts at its current position.
        chunk_size: Number of bytes requested per ``read`` call.

    Returns:
        The MD5 digest as a hexadecimal string.
    """
    h = hashlib.md5()
    while True:
        chunk = f.read(chunk_size)
        if not chunk:
            # An empty read means end of file.
            break
        h.update(chunk)
    return h.hexdigest()
import hashlib
def checksum(f):
    """Return the hexadecimal MD5 digest of the file at path ``f``.

    The file is opened in binary mode (hashlib only accepts bytes, and the
    digest must not depend on newline translation or text decoding) and is
    hashed in fixed-size chunks so large files are never loaded into memory
    all at once. The file handle is closed before returning.

    Arguments:
        f: Path of the file to hash.

    Returns:
        The MD5 digest as a hexadecimal string.
    """
    md5 = hashlib.md5()
    with open(f, "rb") as fh:
        # iter(callable, sentinel) keeps reading until read() returns b'' (EOF)
        for chunk in iter(lambda: fh.read(8192), b""):
            md5.update(chunk)
    return md5.hexdigest()
def is_contents_same(f1, f2):
    """Return True if the files at paths ``f1`` and ``f2`` have equal checksums.

    Both files are hashed with ``checksum`` and the two digests are compared.
    """
    return checksum(f1) == checksum(f2)
# Example usage: report when the two files differ.
# print(...) with a single argument works on both Python 2 and Python 3.
if not is_contents_same('foo.txt', 'bar.txt'):
    print('The contents are not the same!')
# Read both files completely as strings, then print whether they are equal.
# The with-statement closes both handles once the contents have been read.
with open(filename1, "r") as file1, open(filename2, "r") as file2:
    f = file1.read()
    f2 = file2.read()
print(f == f2)
在这段代码中,打开两个文件并将它们作为字符串进行读取。然后检查这两个字符串是否相等并打印结果。
from __future__ import with_statement
# Compare two text files line by line and print the verdict for every line.
filename1 = "G:\\test1.TXT"
filename2 = "G:\\test2.TXT"

# Read both files into lists of lines (line terminators are stripped by
# splitlines()); the handles are closed as soon as the data is in memory.
with open(filename1) as f1:
    with open(filename2) as f2:
        file1list = f1.read().splitlines()
        file2list = f2.read().splitlines()

list1length = len(file1list)
list2length = len(file2list)

if list1length == list2length:
    # Same number of lines: walk both lists in lockstep.
    for line1, line2 in zip(file1list, file2list):
        if line1 == line2:
            print(line1 + "==" + line2)
        else:
            print(line1 + "!=" + line2 + " Not-Equel")
else:
    # A different number of lines means the files cannot be identical.
    print("difference inthe size of the file and number of lines")
import os
def is_file_content_equal(
    file_path_1: str, file_path_2: str, buffer_size: int = 1024 * 8
) -> bool:
    """Checks if two files content is equal

    The file sizes are compared first; only when they match are both files
    read in binary mode, ``buffer_size`` bytes at a time, stopping at the
    first chunk that differs.

    Arguments:
        file_path_1 (str): Path to the first file
        file_path_2 (str): Path to the second file
        buffer_size (int): Size of the buffer to read the file

    Returns:
        bool that indicates if the file contents are equal

    Raises:
        ValueError: If ``buffer_size`` is 0.

    Example:
        >>> is_file_content_equal("filecomp.py", "filecomp copy.py")
        True
        >>> is_file_content_equal("filecomp.py", "diagram.dio")
        False
    """
    # read(0) always returns b"", which would make the loop below report
    # "equal" for any two files of the same size without comparing anything.
    if buffer_size == 0:
        raise ValueError("buffer_size must not be 0")

    # First check sizes
    s1, s2 = os.path.getsize(file_path_1), os.path.getsize(file_path_2)
    if s1 != s2:
        return False

    # If the sizes are the same check the content
    with open(file_path_1, "rb") as fp1, open(file_path_2, "rb") as fp2:
        while True:
            b1 = fp1.read(buffer_size)
            b2 = fp2.read(buffer_size)
            if b1 != b2:
                return False
            # if the content is the same and they are both empty bytes
            # the file is the same
            if not b1:
                return True
filecmp 非常适合用于简单比较文件,但无法打印文件中的行号或差异。
import filecmp
def compare_files(filename1, filename2):
    """Return True if the two files have equal contents.

    Delegates to ``filecmp.cmp`` with ``shallow=False`` so the contents are
    compared rather than only the ``os.stat()`` signatures.
    """
    return filecmp.cmp(filename1, filename2, shallow=False)
def compare_with_line_diff(filename1, filename2):
    """Compare two text files line by line and report the first difference.

    Progress is printed for line 1 and for every 1000th line. On the first
    mismatch the line number and both lines are printed. If the first file
    ends while the second still has lines, the first extra line of the
    second file is printed.

    Arguments:
        filename1: Path of the first file.
        filename2: Path of the second file.

    Returns:
        True if every line matches and both files have the same number of
        lines, otherwise False.
    """
    with open(filename1, "r") as file1, open(filename2, "r") as file2:
        # Loop for all lines in first file (keep only 2 lines in memory)
        for line_num, f1_line in enumerate(file1, start=1):
            # Only print status for range of lines
            if line_num == 1 or line_num % 1000 == 0:
                print(f"comparing lines {line_num} to {line_num + 1000}")

            # Compare with next line of file2. readline() returns '' once
            # file2 is exhausted, which never equals a line read from file1.
            f2_line = file2.readline()
            if f1_line != f2_line:
                print(f"Difference on line: {line_num}")
                print(f"f1_line: '{f1_line}'")
                print(f"f2_line: '{f2_line}'")
                return False

        # Check if file2 has more lines than file1: any line still left in
        # file2 is a difference, so report the first one and stop.
        for extra_line in file2:
            print(f"Difference on file2: {extra_line}")
            return False

    # Files are equal
    return True
filecmp.cmp() 函数除了比较文件内容外,还会比较 inode 号码、ctime 以及其他统计信息。在我的应用中,这是不希望出现的。如果只想比较文件内容而不比较文件统计信息,则使用 f1.read() == f2.read() 可能是更好的方式。 - Ray