Python中的set
对象由以下C结构表示。
/* C-level layout of a Python set object. */
typedef struct {
/* Common object header; other code in this file reads it as so->ob_base. */
PyObject_HEAD
/* NOTE(review): presumably the count of occupied slots (active + dummy) -- confirm against setobject.h. */
Py_ssize_t fill;
/* NOTE(review): presumably the count of active entries -- confirm against setobject.h. */
Py_ssize_t used;
/* Table capacity minus one: set_sizeof uses (mask + 1) as the number of slots in `table`. */
Py_ssize_t mask;
/* Current hash table. set_sizeof treats table == smalltable as "no separately
   allocated table"; otherwise it adds (mask + 1) * sizeof(setentry) for it. */
setentry *table;
/* NOTE(review): purpose not visible in this excerpt -- presumably a cached hash value; confirm. */
Py_hash_t hash;
/* NOTE(review): purpose not visible in this excerpt -- confirm. */
Py_ssize_t finger;
/* Inline storage of PySet_MINSIZE entries embedded in the object itself, so it is
   already counted by sizeof(PySetObject). */
setentry smalltable[PySet_MINSIZE];
/* NOTE(review): presumably the head of the weak-reference list -- confirm. */
PyObject *weakreflist;
} PySetObject;
现在记住,getsizeof()
调用对象的__sizeof__
方法,并在对象由垃圾收集器管理时增加额外的垃圾收集器开销。
好的,set
实现了__sizeof__
。
/*
 * set.__sizeof__: report the memory footprint of a set in bytes.
 *
 * The base figure is the type's basic instance size, which already contains
 * the embedded smalltable.  When the set uses a table other than the embedded
 * one, the (mask + 1) entries of that table are added on top.
 *
 * Returns a new Python int holding the byte count.
 */
static PyObject *
set_sizeof(PySetObject *so, PyObject *Py_UNUSED(ignored))
{
    Py_ssize_t total = _PyObject_SIZE(Py_TYPE(so));

    if (so->table != so->smalltable) {
        /* Table lives outside the object: account for every slot in it. */
        total += (so->mask + 1) * sizeof(setentry);
    }
    return PyLong_FromSsize_t(total);
}
现在让我们来检查一下这行代码。
res = _PyObject_SIZE(Py_TYPE(so));
_PyObject_SIZE
只是一个宏,它会展开为
(typeobj)->tp_basicsize
。
/* Size in bytes of an instance of `typeobj`, read from its tp_basicsize slot. */
#define _PyObject_SIZE(typeobj) ( (typeobj)->tp_basicsize )
这段代码本质上是试图访问
tp_basicsize
槽位,以获取该类型实例的字节大小,而在
set
情况下,它只是
sizeof(PySetObject)
。
PyTypeObject PySet_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"set",
sizeof(PySetObject),
0,
# Skipped rest of the code for brevity.
我已经对
set_sizeof
C函数进行了以下改动。
/*
 * Instrumented variant of set.__sizeof__.
 *
 * Before computing the usual result, it prints to stdout a per-field size
 * breakdown of PySetObject, the total struct size, and whether the set has
 * switched from the embedded smalltable to a separately allocated table
 * (including that table's size in bytes when it has).
 *
 * The return value is unchanged from the stock implementation: the type's
 * basic instance size, plus (mask + 1) * sizeof(setentry) when the table is
 * not the embedded smalltable, returned as a new Python int.
 */
static PyObject *
set_sizeof(PySetObject *so, PyObject *Py_UNUSED(ignored))
{
    Py_ssize_t res;
    /* sizeof yields size_t, and "%zu" requires a size_t argument.  Storing
       these in unsigned long would mismatch the format on platforms where
       the two types differ in width (e.g. LLP64 / 64-bit Windows). */
    size_t py_object_head_size = sizeof(so->ob_base);
    size_t fill_size = sizeof(so->fill);
    size_t used_size = sizeof(so->used);
    size_t mask_size = sizeof(so->mask);
    size_t table_size = sizeof(so->table);
    size_t hash_size = sizeof(so->hash);
    size_t finger_size = sizeof(so->finger);
    size_t smalltable_size = sizeof(so->smalltable);
    size_t weakreflist_size = sizeof(so->weakreflist);
    /* Sum of the individual fields, for comparison with sizeof(PySetObject). */
    size_t fields_total = py_object_head_size + fill_size + used_size
                          + mask_size + table_size + hash_size + finger_size
                          + smalltable_size + weakreflist_size;
    /* True while the set still stores its entries in the embedded smalltable. */
    int is_using_fixed_size_smalltables = so->table == so->smalltable;

    printf("| PySetObject Fields | Size(bytes) |\n");
    printf("|------------------------------------|\n");
    printf("| PyObject_HEAD | '%zu' |\n", py_object_head_size);
    printf("| fill | '%zu' |\n", fill_size);
    printf("| used | '%zu' |\n", used_size);
    printf("| mask | '%zu' |\n", mask_size);
    printf("| table | '%zu' |\n", table_size);
    printf("| hash | '%zu' |\n", hash_size);
    printf("| finger | '%zu' |\n", finger_size);
    printf("| smalltable | '%zu' |\n", smalltable_size);
    printf("| weakreflist | '%zu' |\n", weakreflist_size);
    printf("-------------------------------------|\n");
    printf("| Total | '%zu' |\n", fields_total);
    printf("\n");
    printf("Total size of PySetObject '%zu' bytes\n", sizeof(PySetObject));
    printf("Has set resized: '%s'\n", is_using_fixed_size_smalltables ? "No": "Yes");
    if (!is_using_fixed_size_smalltables) {
        /* The multiplication is carried out in size_t, matching "%zu". */
        printf("Size of malloc'ed table: '%zu' bytes\n", (so->mask + 1) * sizeof(setentry));
    }

    res = _PyObject_SIZE(Py_TYPE(so));
    if (so->table != so->smalltable) {
        res = res + (so->mask + 1) * sizeof(setentry);
    }
    return PyLong_FromSsize_t(res);
}
编译并运行这些改动后,得到如下输出:
>>> import sys
>>>
>>> set_ = set()
>>> sys.getsizeof(set_)
| PySetObject Fields | Size(bytes) |
|------------------------------------|
| PyObject_HEAD | '16' |
| fill | '8' |
| used | '8' |
| mask | '8' |
| table | '8' |
| hash | '8' |
| finger | '8' |
| smalltable | '128' |
| weakreflist | '8' |
-------------------------------------|
| Total | '200' |
Total size of PySetObject '200' bytes
Has set resized: 'No'
216
>>> set_.add(1)
>>> set_.add(2)
>>> set_.add(3)
>>> set_.add(4)
>>> set_.add(5)
>>> sys.getsizeof(set_)
| PySetObject Fields | Size(bytes) |
|------------------------------------|
| PyObject_HEAD | '16' |
| fill | '8' |
| used | '8' |
| mask | '8' |
| table | '8' |
| hash | '8' |
| finger | '8' |
| smalltable | '128' |
| weakreflist | '8' |
-------------------------------------|
| Total | '200' |
Total size of PySetObject '200' bytes
Has set resized: 'Yes'
Size of malloc'ed table: '512' bytes
728
返回值是216/728字节,因为sys.getsizeof
额外加上了16
个字节的GC开销。
但这里需要注意的重要事情是这一行。
| smalltable | '128' |
因为对于小表(在第一次调整大小之前),
so->table
只是
一个指向
固定大小(8
个条目)的so->smalltable
的指针(没有额外分配内存),所以
sizeof(PySetObject)
就足以得到完整大小,因为它已经包含了这块存储的大小(
128字节 = 16(setentry的大小) * 8
)。
当发生调整大小时会怎样?它会构建一个全新的表(通过 malloc 分配),并用该表取代 so->smalltable。这意味着已调整大小的集合除了 malloc 分配的 so->table 之外,还会携带 128 字节的死重(即不再使用的固定大小小表所占的空间)。
else {
newtable = PyMem_NEW(setentry, newsize);
if (newtable == NULL) {
PyErr_NoMemory();
return -1;
}
}
assert(newtable != oldtable);
memset(newtable, 0, sizeof(setentry) * newsize);
so->mask = newsize - 1;
so->table = newtable;
既然小表已经包含在
_PyObject_SIZE(Py_TYPE(so))
中,这是否意味着任何已调整大小的集合都会把小表计算两次,导致set_sizeof
多算PySet_MINSIZE * 16 = 128字节?还是说已调整大小的集合只是背着不再使用的小表这128字节的死重? - wim
如果查看set_table_resize
中的代码(https://github.com/python/cpython/blob/v3.11.0/Objects/setobject.c#L285),那么它应该是后者。 - wim