Find date range overlap in python
Solution 1:
You could just shift the to
column and perform a direct subtraction of the datetimes.
df['overlap'] = (df['to'].shift()-df['from']) > timedelta(0)
Applying this while grouping by id
may look like
df['overlap'] = (df.groupby('id')
.apply(lambda x: (x['to'].shift() - x['from']) > timedelta(0))
.reset_index(level=0, drop=True))
Demo
>>> df
id from to
0 878 2006-01-01 2007-10-01
1 878 2007-10-02 2008-12-01
2 878 2008-12-02 2010-04-03
3 879 2010-04-04 2199-05-11
4 879 2016-05-12 2199-12-31
>>> df['overlap'] = (df.groupby('id')
.apply(lambda x: (x['to'].shift() - x['from']) > timedelta(0))
.reset_index(level=0, drop=True))
>>> df
id from to overlap
0 878 2006-01-01 2007-10-01 False
1 878 2007-10-02 2008-12-01 False
2 878 2008-12-02 2010-04-03 False
3 879 2010-04-04 2199-05-11 False
4 879 2016-05-12 2199-12-31 True
Solution 2:
Another solution. This could be rewritten to leverage Interval.overlaps in pandas 24 and later.
def overlapping_groups(group):
if len(group) > 1:
for index, row in group.iterrows():
for index2, row2 in group.drop(index).iterrows():
int1 = pd.Interval(row2['start_date'],row2['end_date'], closed = 'both')
if row['start_date'] in int1:
return row['id']
if row['end_date'] in int1:
return row['id']
gcols = ['id']
group_output = df.groupby(gcols,group_keys=False).apply(overlapping_groups)
ids_with_overlap = set(group_output[~group_output.isnull()].reset_index(drop = True))
df[df['id'].isin(ids_with_overlap)]
Solution 3:
You can compare the 'from' time with the previous 'to' time:
df['to'].shift() > df['from']
Output:
0 False
1 False
2 False
3 False
4 True
Solution 4:
You can sort the from
column and then simply check if it overlaps with a previous to
column or not using rolling apply function which is very efficient.
df['from'] = pd.DatetimeIndex(df['from']).astype(np.int64)
df['to'] = pd.DatetimeIndex(df['to']).astype(np.int64)
sdf = df.sort_values(by='from')
sdf[["from", "to"]].stack().rolling(window=2).apply(lambda r: 1 if r[1] >= r[0] else 0).unstack()
Now the overlapping periods are the ones with from=0.0
from to
0 NaN 1.0
1 1.0 1.0
2 1.0 1.0
3 1.0 1.0
4 0.0 1.0
Solution 5:
Since I ran into a similar issue like yours, I have been browsing quite extensively. I ran into this solution
this solution.
It uses the function overlaps
from pandas, which is documented in detail here:
here.
def function(df):
timeintervals = pd.IntervalIndex.from_arrays(df.from,df.to,closed='both')
index = np.arange(timeintervals.size)
index_to_keep=[]
for intervals in timeintervals:
index_to_keep.append([0])
control = timeintervals[index].overlaps(timeintervals[index[0]])
if control.any():
index = index[~control]
else:
break
if index.size==0:
break
temp = df.index[index_to_keep]
output = df.loc[temp]
return output