Python Pandas User Warning: Sorting because non-concatenation axis is not aligned
I'm doing some code practice and applying merging of data frames while doing this getting user warning
/usr/lib64/python2.7/site-packages/pandas/core/frame.py:6201: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=True'. To retain the current behavior and silence the warning, pass sort=False
On these lines of code: Can you please help to get the solution of this warning.
placement_video = [self.read_sql_vdx_summary, self.read_sql_video_km]
placement_video_summary = reduce(lambda left, right: pd.merge(left, right, on='PLACEMENT', sort=False), placement_video)
placement_by_video = placement_video_summary.loc[:, ["PLACEMENT", "PLACEMENT_NAME", "COST_TYPE", "PRODUCT",
"VIDEONAME", "VIEW0", "VIEW25", "VIEW50", "VIEW75",
"VIEW100",
"ENG0", "ENG25", "ENG50", "ENG75", "ENG100", "DPE0",
"DPE25",
"DPE50", "DPE75", "DPE100"]]
# print (placement_by_video)
placement_by_video["Placement# Name"] = placement_by_video[["PLACEMENT",
"PLACEMENT_NAME"]].apply(lambda x: ".".join(x),
axis=1)
placement_by_video_new = placement_by_video.loc[:,
["PLACEMENT", "Placement# Name", "COST_TYPE", "PRODUCT", "VIDEONAME",
"VIEW0", "VIEW25", "VIEW50", "VIEW75", "VIEW100",
"ENG0", "ENG25", "ENG50", "ENG75", "ENG100", "DPE0", "DPE25",
"DPE50", "DPE75", "DPE100"]]
placement_by_km_video = [placement_by_video_new, self.read_sql_km_for_video]
placement_by_km_video_summary = reduce(lambda left, right: pd.merge(left, right, on=['PLACEMENT', 'PRODUCT'], sort=False),
placement_by_km_video)
#print (list(placement_by_km_video_summary))
#print(placement_by_km_video_summary)
#exit()
# print(placement_by_video_new)
"""Conditions for 25%view"""
mask17 = placement_by_km_video_summary["PRODUCT"].isin(['Display', 'Mobile'])
mask18 = placement_by_km_video_summary["COST_TYPE"].isin(["CPE", "CPM", "CPCV"])
mask19 = placement_by_km_video_summary["PRODUCT"].isin(["InStream"])
mask20 = placement_by_km_video_summary["COST_TYPE"].isin(["CPE", "CPM", "CPE+", "CPCV"])
mask_video_video_completions = placement_by_km_video_summary["COST_TYPE"].isin(["CPCV"])
mask21 = placement_by_km_video_summary["COST_TYPE"].isin(["CPE+"])
mask22 = placement_by_km_video_summary["COST_TYPE"].isin(["CPE", "CPM"])
mask23 = placement_by_km_video_summary["PRODUCT"].isin(['Display', 'Mobile', 'InStream'])
mask24 = placement_by_km_video_summary["COST_TYPE"].isin(["CPE", "CPM", "CPE+"])
choice25video_eng = placement_by_km_video_summary["ENG25"]
choice25video_vwr = placement_by_km_video_summary["VIEW25"]
choice25video_deep = placement_by_km_video_summary["DPE25"]
placement_by_km_video_summary["25_pc_video"] = np.select([mask17 & mask18, mask19 & mask20, mask17 & mask21],
[choice25video_eng, choice25video_vwr, choice25video_deep])
"""Conditions for 50%view"""
choice50video_eng = placement_by_km_video_summary["ENG50"]
choice50video_vwr = placement_by_km_video_summary["VIEW50"]
choice50video_deep = placement_by_km_video_summary["DPE50"]
placement_by_km_video_summary["50_pc_video"] = np.select([mask17 & mask18, mask19 & mask20, mask17 & mask21],
[choice50video_eng,
choice50video_vwr, choice50video_deep])
"""Conditions for 75%view"""
choice75video_eng = placement_by_km_video_summary["ENG75"]
choice75video_vwr = placement_by_km_video_summary["VIEW75"]
choice75video_deep = placement_by_km_video_summary["DPE75"]
placement_by_km_video_summary["75_pc_video"] = np.select([mask17 & mask18, mask19 & mask20, mask17 & mask21],
[choice75video_eng,
choice75video_vwr,
choice75video_deep])
"""Conditions for 100%view"""
choice100video_eng = placement_by_km_video_summary["ENG100"]
choice100video_vwr = placement_by_km_video_summary["VIEW100"]
choice100video_deep = placement_by_km_video_summary["DPE100"]
choicecompletions = placement_by_km_video_summary['COMPLETIONS']
placement_by_km_video_summary["100_pc_video"] = np.select([mask17 & mask22, mask19 & mask24, mask17 & mask21, mask23 & mask_video_video_completions],
[choice100video_eng, choice100video_vwr, choice100video_deep, choicecompletions])
"""conditions for 0%view"""
choice0video_eng = placement_by_km_video_summary["ENG0"]
choice0video_vwr = placement_by_km_video_summary["VIEW0"]
choice0video_deep = placement_by_km_video_summary["DPE0"]
placement_by_km_video_summary["Views"] = np.select([mask17 & mask18, mask19 & mask20, mask17 & mask21],
[choice0video_eng,
choice0video_vwr,
choice0video_deep])
#print (placement_by_km_video_summary)
#exit()
#final Table
placement_by_video_summary = placement_by_km_video_summary.loc[:,
["PLACEMENT", "Placement# Name", "PRODUCT", "VIDEONAME", "COST_TYPE",
"Views", "25_pc_video", "50_pc_video", "75_pc_video","100_pc_video",
"ENGAGEMENTS","IMPRESSIONS", "DPEENGAMENTS"]]
#placement_by_km_video = [placement_by_video_summary, self.read_sql_km_for_video]
#placement_by_km_video_summary = reduce(lambda left, right: pd.merge(left, right, on=['PLACEMENT', 'PRODUCT']),
#placement_by_km_video)
#print(placement_by_video_summary)
#exit()
# dup_col =["IMPRESSIONS","ENGAGEMENTS","DPEENGAMENTS"]
# placement_by_video_summary.loc[placement_by_video_summary.duplicated(dup_col),dup_col] = np.nan
# print ("Dhar",placement_by_video_summary)
'''adding views based on conditions'''
#filter maximum value from videos
placement_by_video_summary_new = placement_by_km_video_summary.loc[
placement_by_km_video_summary.reset_index().groupby(['PLACEMENT', 'PRODUCT'])['Views'].idxmax()]
#print (placement_by_video_summary_new)
#exit()
# print (placement_by_video_summary_new)
# mask22 = (placement_by_video_summary_new.PRODUCT.str.upper ()=='DISPLAY') & (placement_by_video_summary_new.COST_TYPE=='CPE')
placement_by_video_summary_new.loc[mask17 & mask18, 'Views'] = placement_by_video_summary_new['ENGAGEMENTS']
placement_by_video_summary_new.loc[mask19 & mask20, 'Views'] = placement_by_video_summary_new['IMPRESSIONS']
placement_by_video_summary_new.loc[mask17 & mask21, 'Views'] = placement_by_video_summary_new['DPEENGAMENTS']
#print (placement_by_video_summary_new)
#exit()
placement_by_video_summary = placement_by_video_summary.drop(placement_by_video_summary_new.index).append(
placement_by_video_summary_new).sort_index()
placement_by_video_summary["Video Completion Rate"] = placement_by_video_summary["100_pc_video"] / \
placement_by_video_summary["Views"]
placement_by_video_final = placement_by_video_summary.loc[:,
["Placement# Name", "PRODUCT", "VIDEONAME", "Views",
"25_pc_video", "50_pc_video", "75_pc_video", "100_pc_video",
"Video Completion Rate"]]
tl;dr:
concat
and append
currently sort the non-concatenation index (e.g. columns if you're adding rows) if the columns don't match. In pandas 0.23 this started generating a warning; pass the parameter sort=True
to silence it. In the future the default will change to not sort, so it's best to specify either sort=True
or False
now, or better yet ensure that your non-concatenation indices match.
The warning is new in pandas 0.23.0:
In a future version of pandas pandas.concat()
and DataFrame.append()
will no longer sort the non-concatenation axis when it is not already aligned. The current behavior is the same as the previous (sorting), but now a warning is issued when sort is not specified and the non-concatenation axis is not aligned,
link.
More information from linked very old github issue, comment by smcinerney :
When concat'ing DataFrames, the column names get alphanumerically sorted if there are any differences between them. If they're identical across DataFrames, they don't get sorted.
This sort is undocumented and unwanted. Certainly the default behavior should be no-sort.
After some time the parameter sort
was implemented in pandas.concat
and DataFrame.append
:
sort : boolean, default None
Sort non-concatenation axis if it is not already aligned when join is 'outer'. The current default of sorting is deprecated and will change to not-sorting in a future version of pandas.
Explicitly pass sort=True to silence the warning and sort. Explicitly pass sort=False to silence the warning and not sort.
This has no effect when join='inner', which already preserves the order of the non-concatenation axis.
So if both DataFrames have the same columns in the same order, there is no warning and no sorting:
df1 = pd.DataFrame({"a": [1, 2], "b": [0, 8]}, columns=['a', 'b'])
df2 = pd.DataFrame({"a": [4, 5], "b": [7, 3]}, columns=['a', 'b'])
print (pd.concat([df1, df2]))
a b
0 1 0
1 2 8
0 4 7
1 5 3
df1 = pd.DataFrame({"a": [1, 2], "b": [0, 8]}, columns=['b', 'a'])
df2 = pd.DataFrame({"a": [4, 5], "b": [7, 3]}, columns=['b', 'a'])
print (pd.concat([df1, df2]))
b a
0 0 1
1 8 2
0 7 4
1 3 5
But if the DataFrames have different columns, or the same columns in a different order, pandas returns a warning if no parameter sort
is explicitly set (sort=None
is the default value):
df1 = pd.DataFrame({"a": [1, 2], "b": [0, 8]}, columns=['b', 'a'])
df2 = pd.DataFrame({"a": [4, 5], "b": [7, 3]}, columns=['a', 'b'])
print (pd.concat([df1, df2]))
FutureWarning: Sorting because non-concatenation axis is not aligned.
a b
0 1 0
1 2 8
0 4 7
1 5 3
print (pd.concat([df1, df2], sort=True))
a b
0 1 0
1 2 8
0 4 7
1 5 3
print (pd.concat([df1, df2], sort=False))
b a
0 0 1
1 8 2
0 7 4
1 3 5
If the DataFrames have different columns, but the first columns are aligned - they will be correctly assigned to each other (columns a
and b
from df1
with a
and b
from df2
in the example below) because they exist in both. For other columns that exist in one but not both DataFrames, missing values are created.
Lastly, if you pass sort=True
, columns are sorted alphanumerically. If sort=False
and the second DafaFrame has columns that are not in the first, they are appended to the end with no sorting:
df1 = pd.DataFrame({"a": [1, 2], "b": [0, 8], 'e':[5, 0]},
columns=['b', 'a','e'])
df2 = pd.DataFrame({"a": [4, 5], "b": [7, 3], 'c':[2, 8], 'd':[7, 0]},
columns=['c','b','a','d'])
print (pd.concat([df1, df2]))
FutureWarning: Sorting because non-concatenation axis is not aligned.
a b c d e
0 1 0 NaN NaN 5.0
1 2 8 NaN NaN 0.0
0 4 7 2.0 7.0 NaN
1 5 3 8.0 0.0 NaN
print (pd.concat([df1, df2], sort=True))
a b c d e
0 1 0 NaN NaN 5.0
1 2 8 NaN NaN 0.0
0 4 7 2.0 7.0 NaN
1 5 3 8.0 0.0 NaN
print (pd.concat([df1, df2], sort=False))
b a e c d
0 0 1 5.0 NaN NaN
1 8 2 0.0 NaN NaN
0 7 4 NaN 2.0 7.0
1 3 5 NaN 8.0 0.0
In your code:
placement_by_video_summary = placement_by_video_summary.drop(placement_by_video_summary_new.index)
.append(placement_by_video_summary_new, sort=True)
.sort_index()
jezrael's answer is good, but did not answer a question I had: Will getting the "sort" flag wrong mess up my data in any way? The answer is apparently "no", you are fine either way.
from pandas import DataFrame, concat
a = DataFrame([{'a':1, 'c':2,'d':3 }])
b = DataFrame([{'a':4,'b':5, 'd':6,'e':7}])
>>> concat([a,b],sort=False)
a c d b e
0 1 2.0 3 NaN NaN
0 4 NaN 6 5.0 7.0
>>> concat([a,b],sort=True)
a b c d e
0 1 NaN 2.0 3 NaN
0 4 5.0 NaN 6 7.0