How to merge CSV files with different headers but the same data, based on a condition
Solution 1:
Use:
print (merged_data)
    id    user  product  price[78]  price[79]  Source
0  105  dummya      egg         22       28.0  sheet1
1  119  dummy1     soya         67        NaN  sheet1
2  567  dummya  spinach         22       28.0  sheet2
3  897  dummy1     rose         67       99.0  sheet2
4  345  dummya      egg         87       98.0  sheet3
5  121  dummy1   potato         98       99.0  sheet3
print (Condition)
    Sheet   ID  price1_col1  price1_col2                    price1_out  \
0  sheet1  yes           78          NaN                   price1_col1
1  sheet2  yes           78         79.0       price1_col1+price1_col2
2  sheet3  yes           78         79.0  max(price1_col1,price1_col2)

   price2_col1  price2_col2                    price2_out  sheetname
0           78          NaN                   price2_col1        yes
1           78         79.0       price2_col1+price2_col2        yes
2           79         78.0  min(price2_col1,price2_col2)         no
#merge the data together by a left join
df = merged_data.merge(Condition.rename(columns={'Sheet':'Source'}), on='Source', how='left')
#replace values by empty strings where the flag is not 'yes', and drop the sheetname and ID flag columns
df['Source'] = np.where(df.pop('sheetname') == 'yes', df['Source'], '')
df['id'] = np.where(df.pop('ID') == 'yes', df['id'], '')
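#df now pairs each row of merged_data with its sheet's *_col1/*_col2 labels,
#*_out formulas and flags from Condition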
#filter the columns with integers between [] into a new DataFrame
df1 = df.filter(regex=r'\[\d+\]').copy()
#filter all remaining price columns, excluding the df1 columns
df2 = df[df.filter(regex='price').columns.difference(df1.columns)].copy()
#convert column names to integers
df1.columns = df1.columns.str.extract(r'\[(\d+)\]', expand=False).astype(int)
#helper column for matching missing values
df1['a'] = np.nan
#split columns into those ending with _col1/_col2 and those ending with _out
mask = df2.columns.str.endswith(('_col1','_col2'))
final_cols = df2.columns[~mask]
removed_cols = df2.columns[mask]
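#here removed_cols is ['price1_col1', 'price1_col2', 'price2_col1', 'price2_col2']
#and final_cols is ['price1_out', 'price2_out']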
#replace the label columns in df2 by the matching prices looked up in df1
for c in removed_cols:
    df2[c] = df1.lookup(df1.index, df2[c].fillna('a'))
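#note: DataFrame.lookup was removed in pandas 2.0; on newer versions the loop
#above can be replaced by positional indexing (a sketch, assuming df1 keeps its
#default RangeIndex):
#for c in removed_cols:
#    pos = df1.columns.get_indexer(df2[c].fillna('a'))
#    df2[c] = df1.to_numpy()[np.arange(len(df1)), pos]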
print (df2)
   price1_col1  price1_col2                    price1_out  price2_col1  \
0           22          NaN                   price1_col1         22.0
1           67          NaN                   price1_col1         67.0
2           22         28.0       price1_col1+price1_col2         22.0
3           67         99.0       price1_col1+price1_col2         67.0
4           87         98.0  max(price1_col1,price1_col2)         98.0
5           98         99.0  max(price1_col1,price1_col2)         99.0

   price2_col2                    price2_out
0          NaN                   price2_col1
1          NaN                   price2_col1
2         28.0       price2_col1+price2_col2
3         99.0       price2_col1+price2_col2
4         87.0  min(price2_col1,price2_col2)
5         98.0  min(price2_col1,price2_col2)
#create a MultiIndex to separate each price group
df2.columns = df2.columns.str.split('_', expand=True)
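#df2.columns is now a MultiIndex, e.g. ('price1', 'col1'), ('price1', 'out'),
#('price2', 'col2'), so the groupby(level=0, axis=1) below yields one group per price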
def f(x):
    #remove the first level of the MultiIndex
    x.columns = x.columns.droplevel(0)
    out = []
    #loop over each row
    for v in x.itertuples(index=False):
        #remove the group prefix, e.g. 'price1_'
        t = v.out.replace(x.name + '_', '')
        #loop over the namedtuple fields and replace names by their values
        for k1, v1 in v._asdict().items():
            t = t.replace(k1, str(v1))
        #pd.eval cannot handle min and max, so treat them separately
        if t.startswith('min'):
            out.append(min(pd.eval(t[3:])))
        elif t.startswith('max'):
            out.append(max(pd.eval(t[3:])))
        #handle + - * /
        else:
            out.append(pd.eval(t))
    #return one computed value per row
    return pd.Series(out)
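#for example, in the price1 group the rows above resolve as:
#row 2 (sheet2): 'price1_col1+price1_col2'      -> '22+28.0'      -> pd.eval -> 50.0
#row 4 (sheet3): 'max(price1_col1,price1_col2)' -> 'max(87,98.0)' -> max(pd.eval('(87,98.0)')) -> 98.0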
#overwrite the *_out columns with the computed values
df[final_cols] = df2.groupby(level=0, axis=1).apply(f).add_suffix('_out')
#if necessary, remove the helper columns
df = df.drop(removed_cols, axis=1)
print (df)
    id    user  product  price[78]  price[79]  Source  price1_out  price2_out
0  105  dummya      egg         22       28.0  sheet1        22.0        22.0
1  119  dummy1     soya         67        NaN  sheet1        67.0        67.0
2  567  dummya  spinach         22       28.0  sheet2        50.0        50.0
3  897  dummy1     rose         67       99.0  sheet2       166.0       166.0
4  345  dummya      egg         87       98.0                98.0        87.0
5  121  dummy1   potato         98       99.0                99.0        98.0