Python list对象常用方法深度分析

站长

2024年04月22日 15:59 · 阅读数 54

append()

源文件：Objects/listobject.c

static PyObject *
list_append(PyListObject *self, PyObject *object)
/*[clinic end generated code: output=7c096003a29c0eae input=43a3fe48a7066e91]*/
{
    if (_PyList_AppendTakeRef(self, Py_NewRef(object)) < 0) {
        return NULL;
    }
    Py_RETURN_NONE;
}

static inline int
_PyList_AppendTakeRef(PyListObject *self, PyObject *newitem)
{
    assert(self != NULL && newitem != NULL);
    assert(PyList_Check(self));
    Py_ssize_t len = PyList_GET_SIZE(self);
    Py_ssize_t allocated = self->allocated;
    assert((size_t)len + 1 < PY_SSIZE_T_MAX);
    if (allocated > len) {
        PyList_SET_ITEM(self, len, newitem);
        Py_SET_SIZE(self, len + 1);
        return 0;
    }
    return _PyList_AppendTakeRefListResize(self, newitem);
}

简单解释一下：

开始断言，确保传入的参数都不为 NULL，确保传入的 self 参数是一个合法的列表对象
使用 PyList_GET_SIZE() 函数获取当前列表对象的长度，并将其存储在 len 变量中
获取当前列表对象已经分配的内存大小，并将其存储在 allocated 变量中。这个变量可以帮助确定是否需要重新分配更多的内存来容纳新元素
断言当前列表对象的长度加上新元素的长度不会超过 PY_SSIZE_T_MAX，这是一个编译时常量，表示可用于任何一个 Python 对象的最大内存大小。
如果当前分配的内存可以容纳新元素，则将新元素添加到列表对象中，并更新列表对象的长度。
如果当前分配的内存不能容纳新元素，则调用 _PyList_AppendTakeRefListResize() 函数来重新调整列表对象的大小，并添加新元素.

int
_PyList_AppendTakeRefListResize(PyListObject *self, PyObject *newitem)
{
    Py_ssize_t len = PyList_GET_SIZE(self);
    assert(self->allocated == -1 || self->allocated == len);
    if (list_resize(self, len + 1) < 0) {
        Py_DECREF(newitem);
        return -1;
    }
    PyList_SET_ITEM(self, len, newitem);
    return 0;
}

可以看到，应该是调用list_resize方法进行扩容的。我看通过源码看看扩容策略

static int
list_resize(PyListObject *self, Py_ssize_t newsize)
{
    PyObject **items;
    size_t new_allocated, num_allocated_bytes;
    Py_ssize_t allocated = self->allocated;

    /* Bypass realloc() when a previous overallocation is large enough
       to accommodate the newsize.  If the newsize falls lower than half
       the allocated size, then proceed with the realloc() to shrink the list.
    */
    if (allocated >= newsize && newsize >= (allocated >> 1)) {
        assert(self->ob_item != NULL || newsize == 0);
        Py_SET_SIZE(self, newsize);
        return 0;
    }

    /* This over-allocates proportional to the list size, making room
     * for additional growth.  The over-allocation is mild, but is
     * enough to give linear-time amortized behavior over a long
     * sequence of appends() in the presence of a poorly-performing
     * system realloc().
     * Add padding to make the allocated size multiple of 4.
     * The growth pattern is:  0, 4, 8, 16, 24, 32, 40, 52, 64, 76, ...
     * Note: new_allocated won't overflow because the largest possible value
     *       is PY_SSIZE_T_MAX * (9 / 8) + 6 which always fits in a size_t.
     */
    new_allocated = ((size_t)newsize + (newsize >> 3) + 6) & ~(size_t)3;
    /* Do not overallocate if the new size is closer to overallocated size
     * than to the old size.
     */
    if (newsize - Py_SIZE(self) > (Py_ssize_t)(new_allocated - newsize))
        new_allocated = ((size_t)newsize + 3) & ~(size_t)3;

    if (newsize == 0)
        new_allocated = 0;
    if (new_allocated <= (size_t)PY_SSIZE_T_MAX / sizeof(PyObject *)) {
        num_allocated_bytes = new_allocated * sizeof(PyObject *);
        items = (PyObject **)PyMem_Realloc(self->ob_item, num_allocated_bytes);
    }
    else {
        // integer overflow
        items = NULL;
    }
    if (items == NULL) {
        PyErr_NoMemory();
        return -1;
    }
    self->ob_item = items;
    Py_SET_SIZE(self, newsize);
    self->allocated = new_allocated;
    return 0;
}

简单解释一下源码，关键点：

当 allocated // 2 <= newsize <= allocated时，列表仅仅插入新元素，不进行扩容；只要newsize不在这个范围内时，才需要执行扩容
扩容公式：new_allocated =newsize + (newsize >> 3) + 6) & ~(size_t)3，也就是new_allocated =newsize + newsize // 8 + 3 if newsize < 9 else 6

我们可以自己演算一下扩容空间的变化

列表为空时，allocated = 0
new_size    allocated//2 <= newsize <= allocated    newsize//8 + 3 if newsize < 9 else 6    allocated
  1                         false                         0 + 3 = 3                           1 + 3 = 4 
  2～4                       true                          不扩容                               4
  5                         false                         0 + 3 = 3                           5 + 3 = 8 6～8                       true                            不扩容                               8
  9                         false                         1 + 6 = 7                         9 + 7 = 16
  ...

为了验证推测，我们加入输出代码重新编译python

static int
list_resize(PyListObject *self, Py_ssize_t newsize)
{
    PyObject **items;
    size_t new_allocated, num_allocated_bytes;
    Py_ssize_t allocated = self->allocated;

    /* Bypass realloc() when a previous overallocation is large enough
       to accommodate the newsize.  If the newsize falls lower than half
       the allocated size, then proceed with the realloc() to shrink the list.
    */
    if (allocated >= newsize && newsize >= (allocated >> 1)) {
        assert(self->ob_item != NULL || newsize == 0);
        Py_SET_SIZE(self, newsize);
        return 0;
    }

    /* This over-allocates proportional to the list size, making room
     * for additional growth.  The over-allocation is mild, but is
     * enough to give linear-time amortized behavior over a long
     * sequence of appends() in the presence of a poorly-performing
     * system realloc().
     * Add padding to make the allocated size multiple of 4.
     * The growth pattern is:  0, 4, 8, 16, 24, 32, 40, 52, 64, 76, ...
     * Note: new_allocated won't overflow because the largest possible value
     *       is PY_SSIZE_T_MAX * (9 / 8) + 6 which always fits in a size_t.
     */
    new_allocated = ((size_t)newsize + (newsize >> 3) + 6) & ~(size_t)3;
    PyObject *str = PyUnicode_FromString("开始扩容啦");
    PyObject_Print(str, stdout, 0);
    printf("\n");
    printf("new_allocated     = %d\n", new_allocated);

    /* Do not overallocate if the new size is closer to overallocated size
     * than to the old size.
     */
    if (newsize - Py_SIZE(self) > (Py_ssize_t)(new_allocated - newsize))
        new_allocated = ((size_t)newsize + 3) & ~(size_t)3;
    // ......
}

重新编译后看看那append操作allocated的变化：

l = []
l.append(1) # 开始扩容啦  allocated     = 4
l.append(2) # 未执行该方法
l.append(3) # 未执行该方法
l.append(4) # 未执行该方法
l.append(5) # 开始扩容啦  allocated     = 8
l.append(6) # 未执行该方法
l.append(7) # 未执行该方法
l.append(8) # 未执行该方法
l.append(9) # 开始扩容啦  allocated     = 16

可以看到输出结果和我们的推测结果是一致的。

insert()

源文件：Objects/listobject.c.h

static PyObject *
list_insert(PyListObject *self, PyObject *const *args, Py_ssize_t nargs)
{
    PyObject *return_value = NULL;
    Py_ssize_t index;
    PyObject *object;

    if (!_PyArg_CheckPositional("insert", nargs, 2, 2)) {
        goto exit;
    }
    {
        Py_ssize_t ival = -1;
        PyObject *iobj = _PyNumber_Index(args[0]);
        if (iobj != NULL) {
            ival = PyLong_AsSsize_t(iobj);
            Py_DECREF(iobj);
        }
        if (ival == -1 && PyErr_Occurred()) {
            goto exit;
        }
        index = ival;
    }
    object = args[1];
    return_value = list_insert_impl(self, index, object);

exit:
    return return_value;
}

源文件：Objects/listobject.c

static PyObject *
list_insert_impl(PyListObject *self, Py_ssize_t index, PyObject *object)
/*[clinic end generated code: output=7f35e32f60c8cb78 input=858514cf894c7eab]*/
{
    if (ins1(self, index, object) == 0)
        Py_RETURN_NONE;
    return NULL;
}

static int
ins1(PyListObject *self, Py_ssize_t where, PyObject *v)
{
    Py_ssize_t i, n = Py_SIZE(self);
    PyObject **items;
    if (v == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }

    assert((size_t)n + 1 < PY_SSIZE_T_MAX);
    if (list_resize(self, n+1) < 0)
        return -1;

    if (where < 0) {
        where += n;
        if (where < 0)
            where = 0;
    }
    if (where > n)
        where = n;
    items = self->ob_item;
    for (i = n; --i >= where; )
        items[i+1] = items[i];
    Py_INCREF(v);
    items[where] = v;
    return 0;
}

我们看一下关键部分

    if (where < 0) {
        where += n;
        if (where < 0)
            where = 0;
    }
    if (where > n)
        where = n;
    items = self->ob_item;
    for (i = n; --i >= where; )
        items[i+1] = items[i];

如果下标为负数，则+n，如果+n还是小于0，就将where设置为0，也就是插入到列表的头部

如果下标>0,且大于列表长度，就将where设置为列表长度，也就是插入到列表的尾部

最后将插入位置以后的元素逐一往后移动一个位置。

l = [1,2,3]
l.insert(-4,0) 
l # [0, 1, 2, 3]
l.insert(4,0) 
l # [0, 1, 2, 3, 0]

pop()

源文件：Objects/listobject.c.h

static PyObject *
list_pop(PyListObject *self, PyObject *const *args, Py_ssize_t nargs)
{
    PyObject *return_value = NULL;
    Py_ssize_t index = -1;

    if (!_PyArg_CheckPositional("pop", nargs, 0, 1)) {
        goto exit;
    }
    if (nargs < 1) {
        goto skip_optional;
    }
    {
        Py_ssize_t ival = -1;
        PyObject *iobj = _PyNumber_Index(args[0]);
        if (iobj != NULL) {
            ival = PyLong_AsSsize_t(iobj);
            Py_DECREF(iobj);
        }
        if (ival == -1 && PyErr_Occurred()) {
            goto exit;
        }
        index = ival;
    }
skip_optional:
    return_value = list_pop_impl(self, index);

exit:
    return return_value;
}

从这段源码可以看到，index默认传的-1，也就是默认删除最后一个元素

l = [1,2,3]
l.pop() # 3

源文件：Objects/listobject.c

static PyObject *
list_pop_impl(PyListObject *self, Py_ssize_t index)
/*[clinic end generated code: output=6bd69dcb3f17eca8 input=b83675976f329e6f]*/
{
    PyObject *v;
    int status;

    if (Py_SIZE(self) == 0) {
        /* Special-case most common failure cause */
        PyErr_SetString(PyExc_IndexError, "pop from empty list");
        return NULL;
    }
    if (index < 0)
        index += Py_SIZE(self);
    if (!valid_index(index, Py_SIZE(self))) {
        PyErr_SetString(PyExc_IndexError, "pop index out of range");
        return NULL;
    }
    v = self->ob_item[index];
    if (index == Py_SIZE(self) - 1) {
        status = list_resize(self, Py_SIZE(self) - 1);
        if (status >= 0)
            return v; /* and v now owns the reference the list had */
        else
            return NULL;
    }
    Py_INCREF(v);
    status = list_ass_slice(self, index, index+1, (PyObject *)NULL);
    if (status < 0) {
        Py_DECREF(v);
        return NULL;
    }
    return v;
}

如果列表为空，报错

如果下标为负数，则加上列表长度，转化为普通下标

如果下标不合法，报错

如果下标等于列表长度-1，调用list_resize处理，否则调用list_ass_slice删除元素

remove()

源文件：Objects/listobject.c

static PyObject *
list_remove(PyListObject *self, PyObject *value)
/*[clinic end generated code: output=f087e1951a5e30d1 input=2dc2ba5bb2fb1f82]*/
{
    Py_ssize_t i;

    for (i = 0; i < Py_SIZE(self); i++) {
        PyObject *obj = self->ob_item[i];
        Py_INCREF(obj);
        int cmp = PyObject_RichCompareBool(obj, value, Py_EQ);
        Py_DECREF(obj);
        if (cmp > 0) {
            if (list_ass_slice(self, i, i+1,
                               (PyObject *)NULL) == 0)
                Py_RETURN_NONE;
            return NULL;
        }
        else if (cmp < 0)
            return NULL;
    }
    PyErr_SetString(PyExc_ValueError, "list.remove(x): x not in list");
    return NULL;
}

通过循环查找要删除的元素，找到后调用list_ass_slice删除元素，没找到，报错

copy()

这是一个浅拷贝

源文件：Objects/listobject.c

static PyObject *
list_copy_impl(PyListObject *self)
/*[clinic end generated code: output=ec6b72d6209d418e input=6453ab159e84771f]*/
{
    return list_slice(self, 0, Py_SIZE(self));
}

static PyObject *
list_slice(PyListObject *a, Py_ssize_t ilow, Py_ssize_t ihigh)
{
    PyListObject *np;
    PyObject **src, **dest;
    Py_ssize_t i, len;
    len = ihigh - ilow;
    if (len <= 0) {
        return PyList_New(0);
    }
    np = (PyListObject *) list_new_prealloc(len);
    if (np == NULL)
        return NULL;

    src = a->ob_item + ilow;
    dest = np->ob_item;
    for (i = 0; i < len; i++) {
        PyObject *v = src[i];
        Py_INCREF(v);
        dest[i] = v;
    }
    Py_SET_SIZE(np, len);
    return (PyObject *)np;
}

可以看到copy方法的本质是进行切片操作

该函数接受三个参数:

a:要切片的列表对象。 ilow:切片的起始索引。 ihigh:切片的结束索引。该函数首先使用len = ihigh - ilow计算切片的长度。如果len小于或等于0，则使用PyList_New返回一个空列表。

否则，它将使用list_new_prealloc创建一个与切片大小相同的新列表对象，这是list_new的更快版本，允许预分配列表存储以提高性能。

然后，它使用遍历片中的每个项的循环将原始列表中的项复制到新列表中。对于每个项，它使用Py_INCREF增加其引用计数，并使用dest指针将其分配给新列表中的相应位置。

最后，它使用Py_SET_SIZE设置新列表的大小，它更新PyVarObject_HEAD宏中的ob_size字段以反映列表中的项数，并返回新列表。

注意，这个函数不会修改原始列表对象，而是创建一个新对象，它是原始列表对象切片的浅拷贝。

我们知道list对象的结构体中数组保存的是元素的指针，所以使用copy()复制底层数组时，拷贝的也是元素对象的指针，而不是具体的对象。

结合python程序，看看每句话是啥意思

这个函数不会修改原始列表对象，而是创建一个新对象

l1 = [1,[1,2,3]]
id(l1) # 4449821440
l2 = l1.copy()
id(l2) # 4449821440

可以看到内存地址是不一样的。

copy()复制底层数组时，拷贝的也是元素对象的指针，而不是具体的对象

l1 = [1,[1,2,3]]
id(l1[1]) # 4449821376
l2 = l1.copy()
id(l2[1]) # 4449821376
l2[1].append(4)
l2[1] # [1, 2, 3, 4]
l1[1] # [1, 2, 3, 4]
l2[0] = 2
l2 # [2, [1, 2, 3, 4]]
l1 # [1, [1, 2, 3, 4]]

可以看到，元素的内存地址是一样的，这样修改可变对象时，原始列表也会发生改变。修改不可变对象，对原列表无影响

深拷贝

既然浅拷贝会出现一些副作用，那怎么连元素对象一起拷贝呢？可以通过copy模块中的deepcopy()函数可以进行深拷贝

源文件：Lib/copy.py 我自己针对list类型进行简化版本

def deepcopy(x, memo=None, _nil=[]):
    if memo is None:
        memo = {}

    d = id(x)
    y = memo.get(d, _nil)
    if y is not _nil:
        return y

    cls = type(x)
    copier = _deepcopy_dispatch.get(cls)
    if copier is not None:
        y = copier(x, memo)
    # If is its own copy, don't memoize.
    if y is not x:
        memo[d] = y
        _keep_alive(x, memo) # Make sure x lives at least as long as d
    return y

_deepcopy_dispatch = d = {}

def _deepcopy_atomic(x, memo):
    return x

d[int] = _deepcopy_atomic

def _deepcopy_list(x, memo, deepcopy=deepcopy):
    y = []
    memo[id(x)] = y
    append = y.append
    for a in x:
        append(deepcopy(a, memo))
    return y
d[list] = _deepcopy_list

def _keep_alive(x, memo):
    try:
        memo[id(memo)].append(x)
    except KeyError:
        memo[id(memo)]=[x]

l = [1, [1,2,3]]
deepcopy(l)

可以看到会通过递归来复制所有对象，因此可以避免浅拷贝可能引起的数据共享问题。刚兴趣可以debug看一下

from copy import deepcopy
l1 = [1,[1,2,3]]
id(l1[1]) # 4450996032
l2 = deepcopy(l1)
id(l2[1]) # 4449860096
l2[1].append(4)
l2 # [1, [1, 2, 3, 4]]
l1 # [1, [1, 2, 3]]
l2[0] = 2
l2 # [2, [1, 2, 3, 4]]
l1 # [1, [1, 2, 3]]

可以看到使用深度拷贝，改变可变对象的数据，对原列表没有影响。

总结： deepcopy()函数仅适用于可变对象，例如列表、字典等。对于不可变对象，例如数字、字符串和元组，深拷贝和浅拷贝没有区别，因为无论是深拷贝还是浅拷贝都将创建一个指向同一内存地址的新对象。

想要第一时间看到最新文章，可以关注公众号：郝同学的测开日记

转载自:https://juejin.cn/post/7225879731178455095