我有以下数据集. https://drive.google.com/drive/folders/1NRelNsXQJ7MTNKcm-T69N6r5ZsOyFmTS?usp=sharing 如果列名与表名相同,则将所有内容合并为一个单独的列,以下是代码 import pandas as pdimport globimport os#fi
          https://drive.google.com/drive/folders/1NRelNsXQJ7MTNKcm-T69N6r5ZsOyFmTS?usp=sharing
如果列名与表名相同,则将所有内容合并为一个单独的列,以下是代码
import pandas as pd
import glob
import os
#file directory that contains the csv files
files = glob.glob('/Users/user/Desktop/demo/*.csv')
dfs = [pd.read_csv(fp).assign(SheetName=os.path.basename(fp).split('.')[0]) for fp in files]
data = pd.concat(dfs, ignore_index=True)
data.columns = data.columns.str.lower()
data=data.rename(columns={'sheetname':'Source'})
merged_data = data 
 运行上面的代码后的数据
merged_data
id user product price[78] price[79] Source 105 dummya egg 22 28.0 sheet1 119 dummy1 soya 67 NaN sheet1 567 dummya spinach 22 28.0 sheet2 897 dummy1 rose 67 99.0 sheet2 345 dummya egg 87 98.0 sheet3 121 dummy1 potato 98 99.0 sheet3 
如何在条件下合并文件?
条件.
Sheet   ID  price1_col1 price1_col2 price1         price2_col1 price2_col2 price2                      sheetname
sheet1  yes     78                   price1_col1     78                    price2_col1                  yes
sheet2  yes     78        79         price1_col1+    78         79         price2_col1+                 yes
                                         price1_col2                           price2_col2
sheet3  yes     78        79         max(price1_col1, 79        78         min(price2_col1,price2_col2) no 
                                         price1_col2) 
 上述代码段中的价格1指向sheet1,其列名包含int 78.
如果78 79表示对这些列求和,并将名称作为price1.
产量
id product price1 price2 sheetname 105 egg 22 28 sheet1 119 soya 67 sheet1 567 spinach 50 28 sheet2 897 rose 166 99 sheet2 345 egg 98 87 121 potato 99 98使用:
print (merged_data)
    id    user  product  price[78]  price[79]  Source
0  105  dummya      egg         22       28.0  sheet1
1  119  dummy1     soya         67        NaN  sheet1
2  567  dummya  spinach         22       28.0  sheet2
3  897  dummy1     rose         67       99.0  sheet2
4  345  dummya      egg         87       98.0  sheet3
5  121  dummy1   potato         98       99.0  sheet3
print (Condition)
    Sheet   ID  price1_col1  price1_col2                    price1_out  \
0  sheet1  yes           78          NaN                   price1_col1   
1  sheet2  yes           78         79.0       price1_col1+price1_col2   
2  sheet3  yes           78         79.0  max(price1_col1,price1_col2)   
   price2_col1  price2_col2                    price2_out sheetname  
0           78          NaN                   price2_col1       yes  
1           78         79.0       price2_col1+price2_col2       yes  
2           79         78.0  min(price2_col1,price2_col2)        no 
 #merge data together by left join    
df = merged_data.merge(Condition.rename(columns={'Sheet':'Source'}), on='Source', how='left')
#replace columns to empty strings, remove sheetname and ID columns
df['Source'] = np.where(df.pop('sheetname') == 'yes', df['Source'], '')
df['id'] = np.where(df.pop('ID') == 'yes', df['id'], '')
#filter integers between [] to ned DataFrame 
df1 = df.filter(regex='\[\d+\]').copy()
#filter all columns with price, exclude df1 
df2 = df[df.filter(regex='price').columns.difference(df1.columns)].copy()
#convert column to integers
df1.columns = df1.columns.str.extract('\[(\d+)\]', expand=False).astype(int)
#helper column for match missing values
df1['a'] = np.nan
#filter columns without/with _out
mask = df2.columns.str.endswith(('_col1','_col2'))
final_cols = df2.columns[ ~mask]
removed_cols = df2.columns[mask]
#replace columns by match values from df2
for c in removed_cols:
    df2[c] = df1.lookup(df1.index, df2[c].fillna('a')) 
 print (df2) price1_col1 price1_col2 price1_out price2_col1 \ 0 22 NaN price1_col1 22.0 1 67 NaN price1_col1 67.0 2 22 28.0 price1_col1+price1_col2 22.0 3 67 99.0 price1_col1+price1_col2 67.0 4 87 98.0 max(price1_col1,price1_col2) 98.0 5 98 99.0 max(price1_col1,price1_col2) 99.0 price2_col2 price2_out 0 NaN price2_col1 1 NaN price2_col1 2 28.0 price2_col1+price2_col2 3 99.0 price2_col1+price2_col2 4 87.0 min(price2_col1,price2_col2) 5 98.0 min(price2_col1,price2_col2)
#create MultiIndex for separate eah price groups
df2.columns = df2.columns.str.split('_', expand=True)
def f(x):
    #remove first level
    x.columns = x.columns.droplevel(0)
    out = []
    #loop each row
    for v in x.itertuples(index=False):
        #remove prefix
        t = v.out.replace(x.name+'_', '')
        #loop each namedtuple and replace values
        for k1, v1 in v._asdict().items():
            t = t.replace(k1, str(v1))
        #pd.eval cannot working with min, max, so handled different
        if t.startswith('min'):
            out.append(min(pd.eval(t[3:])))
        elif t.startswith('max'):
            out.append(max(pd.eval(t[3:])))
        #handled +-*/
        else:
            out.append(pd.eval(t))
    #return back
    return pd.Series(out)
#overwrite original columns
df[final_cols] = df2.groupby(level=0, axis=1).apply(f).add_suffix('_out')
#if necessary remove helpers
df = df.drop(removed_cols, axis=1) 
 print (df)
    id    user  product  price[78]  price[79]  Source  price1_out  price2_out
0  105  dummya      egg         22       28.0  sheet1        22.0        22.0
1  119  dummy1     soya         67        NaN  sheet1        67.0        67.0
2  567  dummya  spinach         22       28.0  sheet2        50.0        50.0
3  897  dummy1     rose         67       99.0  sheet2       166.0       166.0
4  345  dummya      egg         87       98.0                98.0        87.0
5  121  dummy1   potato         98       99.0                99.0        98.0
        
             