R: group rows conditionally by rowwise comparisons in a scalable way

Here are two solutions, with dplyr and data.table respectively. Each package vectorizes its operations, so both solutions should be far faster than your loop, and the data.table solution should be the faster of the two.

Let me know how each solution works for you!

Note

To identify the group to which each row belongs, we use the earliest row that it "matches", where two rows "match" if they

share the same value in any of the pairings start<>start, start<>end, end<>start, or end<>end, and have a positive value (> 0) in the corresponding start_sep or end_sep column.

For a smaller dataset, it would be simple enough to perform a CROSS JOIN and then filter by your criteria. However, for a dataset with over 1 million rows, a CROSS JOIN would produce over 1 trillion row pairs (10^6 × 10^6 = 10^12) and easily exhaust the available memory, so I had to find a different technique.
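For reference, here is a minimal sketch of that naive approach in base R; it is fine for toy data but infeasible at scale:

# Naive approach (small data only): CROSS JOIN every row against every
# other row, then filter the pairs by the matching criteria.
pairs <- merge(df, df, by = NULL)  # 'by = NULL' performs a CROSS JOIN
nrow(pairs)                        # nrow(df)^2 rows: ~1e12 for 1e6 rows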

Instead, I use paste0() to generate "artificial" keys, as demonstrated after the assumptions below. Here start and start_sep are combined into start_label, while end and end_sep are combined into end_label. Now we can directly match() on a single column like start_label, rather than sifting every possible match across a set of columns like {start, start_sep}.

This approach assumes that in those * and *_sep columns:

  1. every distinct value can be represented as a distinct string;
  2. the separator " | " never occurs within that string.
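Here is a minimal demo of those artificial keys, using made-up values:

start       <- c("A", "B", "A")
start_sep   <- c(1, 0, 1)

# Combine each pair of values into a single key...
start_label <- paste0(start, " | ", start_sep)
start_label
#> [1] "A | 1" "B | 0" "A | 1"

# ...so match() can locate the FIRST occurrence of each key in one pass.
match(start_label, start_label)
#> [1] 1 2 1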

Solution 1: dplyr

Once you load dplyr

library(dplyr)


# ...
# Code to generate 'df'.
# ...

this workflow should do the trick. Note that group IDs must be calculated before the JOIN, since cur_group_id() would otherwise "misidentify" the NAs as a group unto themselves.
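To see the pitfall in isolation, here is a minimal sketch with a made-up match_id column:

# The NA rows receive a genuine group ID, which we don't want.
tibble(match_id = c(1, 1, NA)) %>%
  group_by(match_id) %>%
  mutate(group_id = cur_group_id())
#> # A tibble: 3 x 2
#> # Groups:   match_id [2]
#>   match_id group_id
#>      <dbl>    <int>
#> 1        1        1
#> 2        1        1
#> 3       NA        2

With that caveat in mind, here is the full workflow: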

df %>%
  mutate(
    # Create an artificial key for matching.
    start_label = paste0(start, " | ", start_sep),
    end_label   = paste0(end,   " | ", end_sep  ),
    
    # Identify the earliest row where each match is found.
    start_to_start = match(start_label, start_label),
    start_to_end   = match(start_label, end_label  ),
    end_to_start   = match(end_label  , start_label),
    end_to_end     = match(end_label  , end_label  )
  ) %>%
  
  # Include only rows meeting the criteria: remove any...
  filter(
  # ...without a match...
  #                   |-------------------------------------------|
    (start_sep > 0 & !(is.na(start_to_start) & is.na(start_to_end))) |
    (end_sep   > 0 & !(is.na(end_to_start  ) & is.na(end_to_end  )))
  #  |-----------|
  # ...that corresponds to a positive '*_sep'.
  ) %>%
  
  # For each row, identify the earliest of ALL its matches.
  mutate(
    match_id = pmin(
      start_to_start, start_to_end, end_to_start, end_to_end,
      na.rm = TRUE
    )
  ) %>%

  # Keep only the 'id' of each row, along with a 'group_id' for its earliest match.
  group_by(match_id) %>%
  transmute(
    id,
    group_id = cur_group_id()
  ) %>%
  ungroup() %>%
  
  # Map the original rows to their 'group_id's; with blanks (NAs) for no match.
  right_join(df, by = "id") %>%
  
  # Format into final form.
  select(id, start, start_sep, end, end_sep, group_id) %>%
  arrange(id)
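In the pmin() step, na.rm = TRUE ensures that an absent match (NA) never masks a real one:

# The earliest among a row's matches, ignoring any NAs.
pmin(3, NA, 7, 1, na.rm = TRUE)
#> [1] 1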

Results

Please note that your sample data is inconsistent, so I have reconstructed my own df:

df <- structure(list(
    id = 1:9,
    start = c("A", "B", "D", "D", "E", "F", "A", "O", "A"),
    start_sep = c(1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L),
    end = c("F", "G", "H", "J", "K", "L", "O", "P", "P"),
    end_sep = c(1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L)
  ),
  class = "data.frame",
  row.names = c(NA, -9L)
)

Given said df, the workflow should yield the following tibble:

# A tibble: 9 x 6
     id start start_sep end   end_sep group_id
  <int> <chr>     <int> <chr>   <int>    <int>
1     1 A             1 F           1        1
2     2 B             0 G           0       NA
3     3 D             1 H           0        2
4     4 D             1 J           0        2
5     5 E             0 K           0       NA
6     6 F             1 L           0        1
7     7 A             0 O           1        3
8     8 O             1 P           0        3
9     9 A             1 P           0        1

Solution 2: data.table

Here is essentially the same logic, but implemented in data.table.

library(data.table)


# ...
# Code to generate 'df'.
# ...


# Convert 'df' to a data.table.
df <- as.data.table(df)

Again, note that group IDs must be calculated before the JOIN, since .GRP would otherwise "misidentify" the NAs as a group unto themselves.
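The pitfall is the same as in the dplyr sketch above; in data.table terms:

# Sketch with a made-up 'match_id' column: the NA rows receive a
# genuine .GRP ID, which we don't want.
dt <- data.table(match_id = c(1, 1, NA))
dt[, group_id := .GRP, by = match_id]
dt
#>    match_id group_id
#> 1:        1        1
#> 2:        1        1
#> 3:       NA        2

With that in mind, here is the full chain: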

# Use 'id' as the key for efficient JOINs.
setkey(df, id

# Calculate the label and matching columns as before.
)[, c("start_label", "end_label") := .(
  paste0(start, " | ", start_sep),
  paste0(end  , " | ", end_sep  )
)][, c("start_to_start", "start_to_end", "end_to_start", "end_to_end") := .(
  match(start_label, start_label),
  match(start_label, end_label  ),
  match(end_label  , start_label),
  match(end_label  , end_label  )

# Filter by criteria as before.
)][
  (start_sep > 0 & !(is.na(start_to_start) & is.na(start_to_end))) |
  (end_sep   > 0 & !(is.na(end_to_start  ) & is.na(end_to_end  )))

# Generate the 'group_id' as before.
,][, .(id, match_id = pmin(
  start_to_start, start_to_end, end_to_start, end_to_end,
  na.rm = TRUE
))][,
  ("group_id") := .GRP,
  by = .(match_id)

# Perform the mapping (RIGHT JOIN) as before...
][
  df,
  # ...and select the desired columns.
  .(id, start, start_sep, end, end_sep, group_id)
]

Results

With df as before, this solution should yield the following data.table:

   id start start_sep end end_sep group_id
1:  1     A         1   F       1        1
2:  2     B         0   G       0       NA
3:  3     D         1   H       0        2
4:  4     D         1   J       0        2
5:  5     E         0   K       0       NA
6:  6     F         1   L       0        1
7:  7     A         0   O       1        3
8:  8     O         1   P       0        3
9:  9     A         1   P       0        1
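As an optional sanity check, you can verify that the two solutions agree. A sketch, assuming you have assigned the two outputs to the hypothetical names result_dplyr and result_dt:

# Both solutions should produce the same rows, columns, and group IDs.
all.equal(as.data.frame(result_dplyr), as.data.frame(result_dt))
#> [1] TRUE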

Performance

At scale, the data.table solution should remain meaningfully faster than the dplyr solution, but both should be quite fast.

Consider the large dataset big_df, a data.frame with over 1 million rows:

# Find every combination of variables...
big_df <- expand.grid(
  start = LETTERS,
  start_sep = 0:1,
  end = LETTERS,
  end_sep = 0:1
)

# ...and repeat until there are (at least) 1 million...
n_comb <- nrow(big_df)
n_rep <- ceiling(1000000/n_comb)

# ...with unique IDs.
big_df <- data.frame(
  id = 1:(n_comb * n_rep),
  start     = rep(big_df$start    , n_rep),
  start_sep = rep(big_df$start_sep, n_rep),
  end       = rep(big_df$end      , n_rep),
  end_sep   = rep(big_df$end_sep  , n_rep)
)
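A quick sanity check on the size, with the arithmetic spelled out:

# 26 * 2 * 26 * 2 = 2704 combinations, each repeated
# ceiling(1000000/2704) = 370 times: 2704 * 370 = 1,000,480 rows.
nrow(big_df)
#> [1] 1000480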

With big_df in hand, we can measure the relative performance of each solution at scale:

library(microbenchmark)

performances <- microbenchmark(
  # Repeat test 50 times, for reliability.
  times = 50,
  
  # Solution 1: "dplyr".
  solution_1 = {
    big_df %>%
      mutate(
        start_label = paste0(start, " | ", start_sep),
        end_label   = paste0(end,   " | ", end_sep  ),
        start_to_start = match(start_label, start_label),
        start_to_end   = match(start_label, end_label  ),
        end_to_start   = match(end_label  , start_label),
        end_to_end     = match(end_label  , end_label  )
      ) %>%
      filter(
        (start_sep > 0 & !(is.na(start_to_start) & is.na(start_to_end))) |
        (end_sep   > 0 & !(is.na(end_to_start  ) & is.na(end_to_end  )))
      ) %>%
      mutate(match_id = pmin(
        start_to_start, start_to_end, end_to_start, end_to_end,
        na.rm = TRUE
      )) %>%
      group_by(match_id) %>%
      transmute(id, group_id = cur_group_id()) %>%
      ungroup() %>%
      right_join(big_df, by = "id") %>%
      select(id, start, start_sep, end, end_sep, group_id) %>%
      arrange(id)
  },
  
  # Solution 2: "data.table"
  solution_2 = {
    big_dt <- as.data.table(big_df)
    
    setkey(big_dt, id)[, c("start_label", "end_label") := .(
      paste0(start, " | ", start_sep),
      paste0(end  , " | ", end_sep  )
    )][, c("start_to_start", "start_to_end", "end_to_start", "end_to_end") := .(
      match(start_label, start_label),
      match(start_label, end_label  ),
      match(end_label  , start_label),
      match(end_label  , end_label  )
    )][
      (start_sep > 0 & !(is.na(start_to_start) & is.na(start_to_end))) |
      (end_sep   > 0 & !(is.na(end_to_start  ) & is.na(end_to_end  )))
    ,][, .(id, match_id = pmin(
      start_to_start, start_to_end, end_to_start, end_to_end,
      na.rm = TRUE
    ))][, ("group_id") := .GRP, by = .(match_id)
    ][big_dt, .(id, start, start_sep, end, end_sep, group_id)]
  }
)

which I have tabulated here:

#> performances

Unit: milliseconds
       expr      min       lq      mean   median        uq       max neval
 solution_1 880.1443 972.9289 1013.2868 997.5746 1059.9192 1186.8743    50
 solution_2 581.2570 606.7222  649.9858 650.2422  679.4404  734.3966    50

By converting from time to speed

library(formattable)

performances %>%
  as_tibble() %>%
  group_by(expr) %>%
  summarize(t_mean = mean(time)) %>%
  transmute(
    solution = expr,
    # Invert time to get speed, normalized against the slowest solution.
    advantage = percent(max(t_mean)/t_mean - 1)
  )

we estimate that the data.table solution is (on average) about 56% faster than the dplyr solution:

# A tibble: 2 x 2
  solution   advantage 
  <fct>      <formttbl>
1 solution_1 0.00%     
2 solution_2 55.89%