Python Pandas:合并不均匀填充的数据框

4

你好,

我们的客户从我们这里获取的CSV导出文件看起来像这样:

id  |  name  |  object_a  |  amount_a  |  object_b  |  amount_b  |  object_c  |  amount_c
1      abc      object_1     12           none         none         none         none


id  |  name  |  object_a  |  amount_a  |  object_b  |  amount_b  |  object_c  |  amount_c
2      def      object_2     7            object_3     19           none         none


id  |  name  |  object_a  |  amount_a  |  object_b  |  amount_b  |  object_c  |  amount_c
3      ghi      object_4     25           none         none         none         none

我现在只关心对象名称和数量这一对数据。在每组数据中,这种数据对的最大数量始终相同,但它们是随机填充的。 我的问题是:是否可以将它们全部加载到数据框中,并将它们转换为像这样的格式:

object   |   amount
object_1     12
object_2     7
object_3     19
object_4     25

把所有这些csv导出文件加载到一个单独的数据框中并不是问题,但是Pandas是否包含这种问题的解决方案?

感谢您的帮助!

4个回答

2

首先将所有的csv文件进行合并,然后使用pd.wide_to_long函数:

csv_paths = ["your_csv_paths..."]

df = pd.concat([pd.read_csv(i) for i in csv_paths]).replace("none", np.NaN)

print (pd.wide_to_long(df, stubnames=["object","amount"],
                       i=["id","name"],j="Hi", suffix="\w*",
                       sep="_").dropna())

              object amount
id name Hi                 
1  abc  a   object_1     12
2  def  a   object_2      7
        b   object_3     19
3  ghi  a   object_4     25

0

parser.py

import pandas as pd

df = pd.read_csv('test.csv')
fields = (
    ('object_a', 'amount_a'),
    ('object_b', 'amount_b'),
    ('object_c', 'amount_c')
)
print(df, '\n')

newDf = pd.DataFrame(columns=('object', 'amount'))
for idx, row in df.iterrows():
    for fieldName, fieldValue in fields:
        if row[fieldName] != 'none':
            newDf.loc[len(newDf)] = (row[fieldName], row[fieldValue])

print(newDf, '\n')

test.csv

id,name,object_a,amount_a,object_b,amount_b,object_c,amount_c
1,abc,object_1,12,none,none,none,none
1,abc,object_2,15,object_3,42,none,none
1,abc,none,none,none,none,object_4,16

输出

   id name  object_a amount_a  object_b amount_b  object_c amount_c
0   1  abc  object_1       12      none     none      none     none
1   1  abc  object_2       15  object_3       42      none     none
2   1  abc      none     none      none     none  object_4       16

     object amount
0  object_1     12
1  object_2     15
2  object_3     42
3  object_4     16

0

这也许不是最好的方法,但如果所有的 .cvs 文件只包含一行,你可以使用以下方式:

import pandas as pd

def append_df(df, result_df):

    for column in df.columns:
        if column.startswith('object_'):
            print(df[column].values)
            if df[column].values[0] != 'none':
                suffix = column.replace('object_','')
                amount_col='amount_'+suffix

                object_name = df[column].values [0]
                amunt_value=df[amount_col].values [0]

                data_to_append={'object':object_name,'amount':amunt_value}
                result_df=result_df.append(data_to_append, ignore_index=True)

    return result_df

result_df=pd.DataFrame()

data={'id':[1], 'name':['abc'],'object_a':['Obj1'], 'amount_a':[17],'object_b':['none'], 'amount_b':['none'],'object_c':['none'], 'amount_c':['none'] }
df = pd.DataFrame(data)
result_df=append_df(df,result_df)

data={'id':[2], 'name':['def'],'object_a':['Obj2'], 'amount_a':[24],'object_b':['Obj3'], 'amount_b':[18],'object_c':['none'], 'amount_c':['none'] }
df = pd.DataFrame(data)
result_df=append_df(df,result_df)

data={'id':[3], 'name':['ghi'],'object_a':['Obj4'], 'amount_a':[40],'object_b':['none'], 'amount_b':['none'],'object_c':['Obj5'], 'amount_c':[70] }
df = pd.DataFrame(data)
result_df=append_df(df,result_df)

#reoder columns
result_df = result_df[['object','amount']]
print(result_df)

结果:

  object  amount
0   Obj1    17.0
1   Obj2    24.0
2   Obj3    18.0
3   Obj4    40.0
4   Obj5    70.0

0

这里有一种方法,使用pd.read_fwf()来读取定宽文件。分隔符位置是通过程序自动找到的。@HenryYik提供的wide_to_long()也被使用了。

# original data
from io import StringIO
import pandas as pd

data = '''id  |  name  |  object_a  |  amount_a  |  object_b  |  amount_b  |  object_c  |  amount_c
1      abc      object_1     12           none         none         none         none
id  |  name  |  object_a  |  amount_a  |  object_b  |  amount_b  |  object_c  |  amount_c
2      def      object_2     7            object_3     19           none         none
id  |  name  |  object_a  |  amount_a  |  object_b  |  amount_b  |  object_c  |  amount_c
3      ghi      object_4     25           none         none         none         none
'''

# get location of delimiters '|' from first line of file
first_line = next(StringIO(data)).rstrip('\n')
delimiter_pos = (
    [-1] +  # we will add 1 to this, to get 'real' starting location
    [idx for idx, c in enumerate(first_line) if c == '|'] + 
    [len(first_line)])

# convert delimiter positions to start/end positions for each field
#   zip() terminates with the shortest sequence is exhausted
colspecs = [ (start + 1, end) 
            for start, end in zip(delimiter_pos, delimiter_pos[1:])]

# import fixed width file
df = pd.read_fwf(StringIO(data), colspecs=colspecs)

# drop repeated header rows
df = df[ df['id'] != df.columns[0] ]

# convert wide to long
df = pd.wide_to_long(
    df, stubnames=['object', 'amount'],
    i = ['id', 'name'], j = 'group',
    suffix='\w*', sep='_',).reset_index()

# drop rows with no info
mask = (df['object'] != 'none') & (df['amount'] != 'none')
t = df.loc[mask, ['object', 'amount']].set_index('object')
print(t)

         amount
object         
object_1     12
object_2      7
object_3     19
object_4     25

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接