User based encoding/convert with its interaction in pandas
First create the final value for each bundle
element using groupby
and cumcount
then pivot your dataframe. Finally reindex it to get all columns:
bundle = [f'b{i}' for i in range(1, 16)]
values = df.sort_values('timestamp').groupby('user_iD').cumcount().add(1)
out = (
df.assign(value=values).pivot_table('value', 'user_iD', 'bundle_id', fill_value=0)
.reindex(bundle, axis=1, fill_value=0)
)
Output:
>>> out
bundle_id b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15
user_iD
1 1 2 0 0 3 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 2 0 0 4 0 0 0 0 3
3 0 0 1 0 0 0 0 0 0 2 0 3 0 0 0
104 1 0 0 2 0 3 0 0 0 0 0 0 0 0 0
>>> out.reset_index().rename_axis(columns=None)
user_iD b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15
0 1 1 2 0 0 3 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0 0 2 0 0 4 0 0 0 0 3
2 3 0 0 1 0 0 0 0 0 0 2 0 3 0 0 0
3 104 1 0 0 2 0 3 0 0 0 0 0 0 0 0 0
Lacking more Pythonish experience, I'm proposing the following (partially commented) code snippet which is not optimized in any way, being based merely on elementary pandas.DataFrame
API reference.
import pandas as pd
import io
import sys
data_string = '''
user_iD;question_id;user_answer;correct_answer;correct;elapsed_time;solving_id;bundle_id;timestamp
1;1;A;A;1.0;5.00;1;b1;1547794902000
1;2;D;D;1.0;3.00;2;b2;1547795130000
1;5;C;C;1.0;7.00;5;b5;1547795370000
2;10;C;C;1.0;5.00;10;b10;1547806170000
2;1;B;B;1.0;15.0;1;b1;1547802150000
2;15;A;A;1.0;2.00;15;b15;1547803230000
2;7;C;C;1.0;5.00;7;b7;1547802730000
3;12;A;A;1.0;1.00;25;b12;1547771110000
3;10;C;C;1.0;2.00;10;b10;1547770810000
3;3;D;D;1.0;5.00;3;b3;1547770390000
104;6;C;C;1.0;6.00;6;b6;1553040610000
104;4;A;A;1.0;5.00;4;b4;1553040547000
104;1;A;A;1.0;2.00;1;b1;1553040285000
'''
df = pd.read_csv( io.StringIO(data_string), sep=";", encoding='utf-8')
# get only necessary columns ordered by timestamp
df_aux = df[['user_iD','bundle_id','correct', 'timestamp']].sort_values(by=['timestamp'])
# hard coded new headers (possible to build from real 'bundle_id's)
df_new_headers = ['b{}'.format(x+1) for x in range(15)]
df_new_headers.insert(0, 'user_iD')
dict_answered = {}
# create a new dataframe (I'm sure that there is a more Pythonish solution)
df_new_data = []
user_ids = sorted(set( [x for label, x in df_aux.user_iD.items()]))
for user_id in user_ids:
dict_answered[user_id] = 0
if len( sys.argv) > 1 and sys.argv[1]:
# supplied arg in the next line for better result readability
df_new_values = [sys.argv[1].strip('"').strip("'")
for x in range(len(df_new_headers)-1)]
else:
# zeroes (original assignment)
df_new_values = [0 for x in range(len(df_new_headers)-1)]
df_new_values.insert(0, user_id)
df_new_data.append(df_new_values)
df_new = pd.DataFrame(data=df_new_data, columns=df_new_headers)
# fill the new dataframe using values from the original one
for aux in df_aux.itertuples(index=True, name=None):
if aux[3] == 1.0:
# add 1 to number of already answered questions for current user
dict_answered[aux[1]] += 1
df_new.loc[ df_new["user_iD"] == aux[1], aux[2]] = dict_answered[aux[1]]
print( df_new)
Output examples
Example: .\SO\70751715.py
user_iD b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15
0 1 1 2 0 0 3 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0 0 2 0 0 4 0 0 0 0 3
2 3 0 0 1 0 0 0 0 0 0 2 0 3 0 0 0
3 104 1 0 0 2 0 3 0 0 0 0 0 0 0 0 0
Example: .\SO\70751715.py .
user_iD b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15
0 1 1 2 . . 3 . . . . . . . . . .
1 2 1 . . . . . 2 . . 4 . . . . 3
2 3 . . 1 . . . . . . 2 . 3 . . .
3 104 1 . . 2 . 3 . . . . . . . . .
Example: .\SO\70751715.py ''
user_iD b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15
0 1 1 2 3
1 2 1 2 4 3
2 3 1 2 3
3 104 1 2 3