pandas如何根据某列不同值，对其他列用不同的bins和labels进行cut函数切割？

站长

2024年07月03日 08:46 · 阅读数 22

问题描述

有一个DataFrame，我根据某一列‘type’的不同，对‘value’一列进行cut切割分组，赋予不同的分组值。但如果bins和labels数值或者长度不一致，就无法在同一个df里继续操作。

问题出现的环境背景及自己尝试过哪些方法

我尝试用apply的方式去解决，但并没有用好cut函数。

相关代码

import pandas as pd

# Build the sample data: one numeric column and one column selecting the binning scheme
df = pd.DataFrame({
    'value': [5, 10, 15, 20, 25, 30],
    'type': [1, 1, 1, 2, 2, 2]
})

# Define two binning schemes; the bin edges and the labels differ in values and in length
bins1 = [0, 6, 20]
labels1=[3,2]
bins2 = [0, 20, 22, 50]
labels2=[3,2,1]

# Cut the type == 1 rows with bins1 and the type == 2 rows with bins2
df.loc[df['type'] == 1, 'group'] = pd.cut(df.loc[df['type'] == 1, 'value'], bins=bins1, labels=labels1)  # succeeds; 'group' ends up with category dtype
df.loc[df['type'] == 2, 'group'] = pd.cut(df.loc[df['type'] == 2, 'value'], bins=bins2, labels=labels2)  # raises TypeError: categories differ from the existing 'group' column

# Print the result
print(df)

报错信息如下：

TypeError                                 Traceback (most recent call last)
Cell In[111], line 17
     15 # 对type=1的数据使用bins1进行cut，对type=2的数据使用bins2进行cut
     16 df.loc[df['type'] == 1, 'group'] = pd.cut(df.loc[df['type'] == 1, 'value'], bins=bins1, labels=labels1)
---> 17 df.loc[df['type'] == 2, 'group'] = pd.cut(df.loc[df['type'] == 2, 'value'], bins=bins2, labels=labels2)
     19 # 输出结果
     20 print(df)

File F:\Anaconda\lib\site-packages\pandas\core\indexing.py:818, in _LocationIndexer.__setitem__(self, key, value)
    815 self._has_valid_setitem_indexer(key)
    817 iloc = self if self.name == "iloc" else self.obj.iloc
--> 818 iloc._setitem_with_indexer(indexer, value, self.name)

File F:\Anaconda\lib\site-packages\pandas\core\indexing.py:1795, in _iLocIndexer._setitem_with_indexer(self, indexer, value, name)
   1792 # align and set the values
   1793 if take_split_path:
   1794     # We have to operate column-wise
-> 1795     self._setitem_with_indexer_split_path(indexer, value, name)
   1796 else:
   1797     self._setitem_single_block(indexer, value, name)

File F:\Anaconda\lib\site-packages\pandas\core\indexing.py:1838, in _iLocIndexer._setitem_with_indexer_split_path(self, indexer, value, name)
   1834     self._setitem_with_indexer_2d_value(indexer, value)
   1836 elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi):
   1837     # We are setting multiple rows in a single column.
-> 1838     self._setitem_single_column(ilocs[0], value, pi)
   1840 elif len(ilocs) == 1 and 0 != lplane_indexer != len(value):
   1841     # We are trying to set N values into M entries of a single
   1842     #  column, which is invalid for N != M
   1843     # Exclude zero-len for e.g. boolean masking that is all-false
   1845     if len(value) == 1 and not is_integer(info_axis):
   1846         # This is a case like df.iloc[:3, [1]] = [0]
   1847         #  where we treat as df.iloc[:3, 1] = 0

File F:\Anaconda\lib\site-packages\pandas\core\indexing.py:1992, in _iLocIndexer._setitem_single_column(self, loc, value, plane_indexer)
   1988         value = value[pi]
   1989 else:
   1990     # set value into the column (first attempting to operate inplace, then
   1991     #  falling back to casting if necessary)
-> 1992     self.obj._mgr.column_setitem(loc, plane_indexer, value)
   1993     self.obj._clear_item_cache()
   1994     return

File F:\Anaconda\lib\site-packages\pandas\core\internals\managers.py:1391, in BlockManager.column_setitem(self, loc, idx, value, inplace)
   1389     col_mgr.setitem_inplace(idx, value)
   1390 else:
-> 1391     new_mgr = col_mgr.setitem((idx,), value)
   1392     self.iset(loc, new_mgr._block.values, inplace=True)

File F:\Anaconda\lib\site-packages\pandas\core\internals\managers.py:393, in BaseBlockManager.setitem(self, indexer, value)
    388 if _using_copy_on_write() and not self._has_no_reference(0):
    389     # if being referenced -> perform Copy-on-Write and clear the reference
    390     # this method is only called if there is a single block -> hardcoded 0
    391     self = self.copy()
--> 393 return self.apply("setitem", indexer=indexer, value=value)

File F:\Anaconda\lib\site-packages\pandas\core\internals\managers.py:352, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
    350         applied = b.apply(f, **kwargs)
    351     else:
--> 352         applied = getattr(b, f)(**kwargs)
    353 except (TypeError, NotImplementedError):
    354     if not ignore_failures:

File F:\Anaconda\lib\site-packages\pandas\core\internals\blocks.py:1417, in EABackedBlock.setitem(self, indexer, value)
   1414 check_setitem_lengths(indexer, value, values)
   1416 try:
-> 1417     values[indexer] = value
   1418 except (ValueError, TypeError) as err:
   1419     _catch_deprecated_value_error(err)

File F:\Anaconda\lib\site-packages\pandas\core\arrays\_mixins.py:266, in NDArrayBackedExtensionArray.__setitem__(self, key, value)
    264 def __setitem__(self, key, value) -> None:
    265     key = check_array_indexer(self, key)
--> 266     value = self._validate_setitem_value(value)
    267     self._ndarray[key] = value

File F:\Anaconda\lib\site-packages\pandas\core\arrays\categorical.py:1558, in Categorical._validate_setitem_value(self, value)
   1555 def _validate_setitem_value(self, value):
   1556     if not is_hashable(value):
   1557         # wrap scalars and hashable-listlikes in list
-> 1558         return self._validate_listlike(value)
   1559     else:
   1560         return self._validate_scalar(value)

File F:\Anaconda\lib\site-packages\pandas\core\arrays\categorical.py:2228, in Categorical._validate_listlike(self, value)
   2226 if isinstance(value, Categorical):
   2227     if not is_dtype_equal(self.dtype, value.dtype):
-> 2228         raise TypeError(
   2229             "Cannot set a Categorical with another, "
   2230             "without identical categories"
   2231         )
   2232     # is_dtype_equal implies categories_match_up_to_permutation
   2233     value = self._encode_with_my_categories(value)

TypeError: Cannot set a Categorical with another, without identical categories

你期待的结果是什么？实际看到的错误信息又是什么？

我期望type=1的数据，能够分两组，[0,6)区间的‘group’字段为3，[6,20)区间的‘group’字段为2。而type=2的数据，能够分三组，也满足cut()的结果。

上述代码中，虽然用.loc索引器对不同type的数据分别进行了筛选，但第一次cut分组成功后（即第16行代码运行成功），第二次cut分组会失败（即第17行代码报错）。df的group字段似乎已经被固定了类别，只能用相同取值和相同长度的labels进行分组。该字段类型为category，我怀疑这才是不能对同一字段按不同分组依据进行切割的原因。

1个回答

test

2024-07-03

# Run this after the first (type == 1) assignment from the question.
# At that point 'group' is a category column whose categories come from labels1,
# and assigning a Categorical with different categories into it raises
# "Cannot set a Categorical with another, without identical categories".
# Casting the column to object removes the fixed category set.
df['group'] = df['group'].astype('object')
# With 'group' as an object column, the type == 2 rows are intended to accept
# the labels produced from bins2/labels2.
# NOTE(review): the question expects left-closed [a, b) intervals; per the pandas
# docs pd.cut is right-closed by default, so right=False may be needed - confirm.
df.loc[df['type'] == 2, 'group'] = pd.cut(df.loc[df['type'] == 2, 'value'], bins=bins2, labels=labels2)

适合作为回答的

经过验证的有效解决办法
自己的经验指引，对解决问题有帮助
遵循 Markdown 语法排版，代码语义正确

不该作为回答的

询问内容细节或回复楼层
与题目无关的内容
“赞”“顶”“同问”“看手册”“解决了没”等毫无意义的内容