pandas如何根据某列不同值,对其他列用不同的bins和labels进行cut函数切割?
问题描述
有一个DataFrame,我根据某一列‘type’的不同,对‘value’一列进行cut切割分组,赋予不同的分组值。但如果bins和labels数值或者长度不一致,就无法在同一个df里继续操作。
问题出现的环境背景及自己尝试过哪些方法
我尝试用apply的方式去解决,但并没有用好cut函数。
相关代码
import pandas as pd
# Build the sample data
df = pd.DataFrame({
'value': [5, 10, 15, 20, 25, 30],
'type': [1, 1, 1, 2, 2, 2]
})
# Define two sets of bin edges and labels (bins1 yields 2 intervals, bins2 yields 3)
bins1 = [0, 6, 20]
labels1=[3,2]
bins2 = [0, 20, 22, 50]
labels2=[3,2,1]
# Cut the type == 1 rows with bins1 and the type == 2 rows with bins2
df.loc[df['type'] == 1, 'group'] = pd.cut(df.loc[df['type'] == 1, 'value'], bins=bins1, labels=labels1)  # succeeds; per the traceback, 'group' is then held as a Categorical
df.loc[df['type'] == 2, 'group'] = pd.cut(df.loc[df['type'] == 2, 'value'], bins=bins2, labels=labels2)  # raises TypeError: the new Categorical's categories differ from those already in 'group'
# Print the result
print(df)
报错信息如下:
TypeError Traceback (most recent call last)
Cell In[111], line 17
15 # 对type=1的数据使用bins1进行cut,对type=2的数据使用bins2进行cut
16 df.loc[df['type'] == 1, 'group'] = pd.cut(df.loc[df['type'] == 1, 'value'], bins=bins1, labels=labels1)
---> 17 df.loc[df['type'] == 2, 'group'] = pd.cut(df.loc[df['type'] == 2, 'value'], bins=bins2, labels=labels2)
19 # 输出结果
20 print(df)
File F:\Anaconda\lib\site-packages\pandas\core\indexing.py:818, in _LocationIndexer.__setitem__(self, key, value)
815 self._has_valid_setitem_indexer(key)
817 iloc = self if self.name == "iloc" else self.obj.iloc
--> 818 iloc._setitem_with_indexer(indexer, value, self.name)
File F:\Anaconda\lib\site-packages\pandas\core\indexing.py:1795, in _iLocIndexer._setitem_with_indexer(self, indexer, value, name)
1792 # align and set the values
1793 if take_split_path:
1794 # We have to operate column-wise
-> 1795 self._setitem_with_indexer_split_path(indexer, value, name)
1796 else:
1797 self._setitem_single_block(indexer, value, name)
File F:\Anaconda\lib\site-packages\pandas\core\indexing.py:1838, in _iLocIndexer._setitem_with_indexer_split_path(self, indexer, value, name)
1834 self._setitem_with_indexer_2d_value(indexer, value)
1836 elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi):
1837 # We are setting multiple rows in a single column.
-> 1838 self._setitem_single_column(ilocs[0], value, pi)
1840 elif len(ilocs) == 1 and 0 != lplane_indexer != len(value):
1841 # We are trying to set N values into M entries of a single
1842 # column, which is invalid for N != M
1843 # Exclude zero-len for e.g. boolean masking that is all-false
1845 if len(value) == 1 and not is_integer(info_axis):
1846 # This is a case like df.iloc[:3, [1]] = [0]
1847 # where we treat as df.iloc[:3, 1] = 0
File F:\Anaconda\lib\site-packages\pandas\core\indexing.py:1992, in _iLocIndexer._setitem_single_column(self, loc, value, plane_indexer)
1988 value = value[pi]
1989 else:
1990 # set value into the column (first attempting to operate inplace, then
1991 # falling back to casting if necessary)
-> 1992 self.obj._mgr.column_setitem(loc, plane_indexer, value)
1993 self.obj._clear_item_cache()
1994 return
File F:\Anaconda\lib\site-packages\pandas\core\internals\managers.py:1391, in BlockManager.column_setitem(self, loc, idx, value, inplace)
1389 col_mgr.setitem_inplace(idx, value)
1390 else:
-> 1391 new_mgr = col_mgr.setitem((idx,), value)
1392 self.iset(loc, new_mgr._block.values, inplace=True)
File F:\Anaconda\lib\site-packages\pandas\core\internals\managers.py:393, in BaseBlockManager.setitem(self, indexer, value)
388 if _using_copy_on_write() and not self._has_no_reference(0):
389 # if being referenced -> perform Copy-on-Write and clear the reference
390 # this method is only called if there is a single block -> hardcoded 0
391 self = self.copy()
--> 393 return self.apply("setitem", indexer=indexer, value=value)
File F:\Anaconda\lib\site-packages\pandas\core\internals\managers.py:352, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
350 applied = b.apply(f, **kwargs)
351 else:
--> 352 applied = getattr(b, f)(**kwargs)
353 except (TypeError, NotImplementedError):
354 if not ignore_failures:
File F:\Anaconda\lib\site-packages\pandas\core\internals\blocks.py:1417, in EABackedBlock.setitem(self, indexer, value)
1414 check_setitem_lengths(indexer, value, values)
1416 try:
-> 1417 values[indexer] = value
1418 except (ValueError, TypeError) as err:
1419 _catch_deprecated_value_error(err)
File F:\Anaconda\lib\site-packages\pandas\core\arrays\_mixins.py:266, in NDArrayBackedExtensionArray.__setitem__(self, key, value)
264 def __setitem__(self, key, value) -> None:
265 key = check_array_indexer(self, key)
--> 266 value = self._validate_setitem_value(value)
267 self._ndarray[key] = value
File F:\Anaconda\lib\site-packages\pandas\core\arrays\categorical.py:1558, in Categorical._validate_setitem_value(self, value)
1555 def _validate_setitem_value(self, value):
1556 if not is_hashable(value):
1557 # wrap scalars and hashable-listlikes in list
-> 1558 return self._validate_listlike(value)
1559 else:
1560 return self._validate_scalar(value)
File F:\Anaconda\lib\site-packages\pandas\core\arrays\categorical.py:2228, in Categorical._validate_listlike(self, value)
2226 if isinstance(value, Categorical):
2227 if not is_dtype_equal(self.dtype, value.dtype):
-> 2228 raise TypeError(
2229 "Cannot set a Categorical with another, "
2230 "without identical categories"
2231 )
2232 # is_dtype_equal implies categories_match_up_to_permutation
2233 value = self._encode_with_my_categories(value)
TypeError: Cannot set a Categorical with another, without identical categories
你期待的结果是什么?实际看到的错误信息又是什么?
我期望type=1的数据,能够分两组,[0,6)区间的‘group’字段为3,[6,20)区间的‘group’字段为2;而type=2的数据,能够分三组,也满足cut()的结果。(注意:pd.cut默认right=True,得到的区间是左开右闭的,如(0,6]、(6,20];若需要左闭右开区间,应传入right=False。)
上述代码中,虽然用.loc函数对不同分组进行了筛选。但第一次cut分组成功后(即第16行代码是运行成功的),第二次cut分组会失败(即第17行代码会报错),df的group字段,似乎已经定死了方式,只能用相同类型和相同长度的labels进行分组。字段类型为category,我怀疑这个才是不能对同一字段进行不同分组依据切割的原因。
回复
1个回答
test
2024-07-03
# After the first assignment 'group' is categorical and only accepts values
# with identical categories; casting it to object lets it hold the labels
# produced by the second cut as well.
df['group'] = df['group'].astype('object')
df.loc[df['type'].eq(2), 'group'] = pd.cut(
    df.loc[df['type'].eq(2), 'value'],
    bins=bins2,
    labels=labels2,
)
回复
适合作为回答的
- 经过验证的有效解决办法
- 自己的经验指引,对解决问题有帮助
- 遵循 Markdown 语法排版,代码语义正确
不该作为回答的
- 询问内容细节或回复楼层
- 与题目无关的内容
- “赞”“顶”“同问”“看手册”“解决了没”等毫无意义的内容