根据您提供的数据框:
import pandas as pd
# Sample input frame, written row by row. Each ``response`` cell is a string
# holding a dict-like payload that embeds an ``array([...])`` call and lacks
# its final closing brace.
df = pd.DataFrame(
    [
        (
            1,
            "1/14/2021",
            "M",
            "{'score':3,'reason':{'description':array(['a','b','c'])}",
        ),
        (
            2,
            "5/16/2020",
            "F",
            "{'score':4,'reason':{'description':array(['x','y','z'])}",
        ),
    ],
    columns=["id", "date", "gender", "response"],
)
您可以定义一个递归函数来展开 `response` 列中的嵌套值:
def flatten(data, new_data):
    """Recursively collect the leaf values of a nested dictionary.

    Nested dictionaries (including dictionaries stored inside lists) are
    walked depth-first and every leaf is copied into ``new_data`` under its
    own key. Parent keys are discarded, so a key that occurs more than once
    ends up with the value visited last.

    Leaves are strings, integers, floats, NumPy arrays and non-empty lists
    that contain no dictionaries. Values of any other type (for example
    ``None``) are skipped.

    Args:
        data: nested dictionary.
        new_data: dictionary that receives the leaves (normally empty); it is
            modified in place.

    Returns:
        ``new_data``, holding the flattened key/value pairs.
    """
    for key, value in data.items():
        if isinstance(value, dict):
            flatten(value, new_data)
        elif isinstance(value, list):
            nested = [item for item in value if isinstance(item, dict)]
            for item in nested:
                flatten(item, new_data)
            # A non-empty list without dictionaries is data, not structure:
            # keep it as a leaf instead of calling .items() on its elements.
            if value and not nested:
                new_data[key] = value
        elif isinstance(value, (str, int, float, ndarray)):
            # float is accepted so that non-integer numbers are not dropped.
            # ``ndarray`` is resolved at call time from the module namespace.
            new_data[key] = value
    return new_data
然后,使用 NumPy 的 `ndarray` 来处理数组,并使用 Python 内置函数 `eval` 将 `response` 列中的字符串转换为字典(注意:`eval` 会执行任意代码,只应用于可信数据),按以下步骤进行:
import numpy as np
from numpy import ndarray
# Parse each ``response`` string into a flat dict. The strings call
# ``array(...)`` and lack their final closing brace, hence the textual patch
# before evaluation.
# NOTE(security): eval() executes arbitrary code. Only run this on trusted
# data; ast.literal_eval cannot be used as-is because of the array(...) calls.
df["response"] = df["response"].apply(
    lambda x: flatten(eval(x.replace("array", "np.array") + "}"), {})
)

# For every original row, put its scalar columns next to the frame built from
# its flattened response (one line per array element), then forward-fill the
# scalar columns down those lines.
new_df = pd.concat(
    [
        pd.concat(
            [
                # Re-index the one-row frame from 0 as well. Keeping the
                # original label ``idx`` misaligns it with the response frame:
                # the outer join then reorders the lines and, once ``idx`` is
                # at least the array length, adds an extra line with missing
                # response values.
                pd.DataFrame(df.loc[idx, :])
                .T.drop(columns="response")
                .reset_index(drop=True),
                pd.DataFrame(df.loc[idx, "response"]).reset_index(drop=True),
            ],
            axis=1,
        ).ffill()  # fillna(method="ffill") is deprecated since pandas 2.1
        for idx in df.index
    ]
).reset_index(drop=True)
所以:
# Show the resulting frame.
print(new_df)
id date gender score description
0 1 1/14/2021 M 3 a
1 1 1/14/2021 M 3 b
2 1 1/14/2021 M 3 c
3 2 5/16/2020 F 4 y
4 2 5/16/2020 F 4 x
5 2 5/16/2020 F 4 z
也可以使用 `pd.json_normalize` 来展开它。 - NoobVB