Granges - (left) join
Solution 1:
This is easy to do using findOverlaps
library(GenomicRanges)
m <- findOverlaps(s, t)
# Add gc.name to subject GRanges (i.e. left join)
mcols(t)$gc.name <- NA
mcols(t)[subjectHits(m), "gc.name"] = mcols(s)[queryHits(m), "gc.name"]
t
#GRanges object with 2 ranges and 6 metadata columns:
# seqnames ranges strand | pvalue qvalue meth.diff gc.X
# <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric> <character>
# [1] MT [ 708, 708] + | 0.3361535 0.06058539 0.4743142 a
# [2] MT [1147, 1147] - | 0.4637233 0.19743361 0.3010486 b
# gc.score gc.name
# <numeric> <character>
# [1] 80 rs28412942
# [2] 80 <NA>
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths
Sample data
set.seed(2018)
t <- GRanges(
seqnames = c("MT", "MT"),
IRanges(start = c(708, 1147), end = c(708, 1147)),
strand = c("+", "-"),
pvalue = runif(2),
qvalue = runif(2),
meth.diff = runif(2),
gc.X = letters[1:2],
gc.score = rep(80, 2))
set.seed(2018)
s <- GRanges(
seqnames = "MT",
IRanges(start = 708, end = 708),
strand = "+",
pvalue = runif(1),
qvalue = runif(1),
meth.diff = runif(1),
gc.X = letters[3],
gc.name = "rs28412942",
gc.score = 0)
Solution 2:
The answer form @Mautirus Evers will fail in case there are multiple overlaps. findOverlaps
correctly identifies two overlaps but since the input query has only one line the second part with mcols
assignes only the second of the overlaps.
"Failed" test:
set.seed(2018)
t <- GRanges(
seqnames = "MT",
IRanges(start = 708, end = 708),
strand = "+",
pvalue = runif(1),
qvalue = runif(1),
meth.diff = runif(1),
gc.X = letters[1],
gc.score = rep(80, 1))
t
# GRanges object with 1 range and 5 metadata columns:
# seqnames ranges strand | pvalue qvalue meth.diff gc.X gc.score
# <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric> <character> <numeric>
# [1] MT 708 + | 0.336153471143916 0.463723270455375 0.0605853858869523 a 80
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths
set.seed(2018)
s <- GRanges(
seqnames = "MT",
IRanges(start = c(708, 708), end = c(708, 708)),
strand = c("+", "+"),
pvalue = runif(2),
qvalue = runif(2),
meth.diff = runif(2),
gc.X = letters[3:4],
gc.name = c("rs28412942", "rs28412942dupl"),
gc.score = 0)
s
# GRanges object with 2 ranges and 6 metadata columns:
# seqnames ranges strand | pvalue qvalue meth.diff gc.X gc.name gc.score
# <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric> <character> <character> <numeric>
# [1] MT 708 + | 0.336153471143916 0.0605853858869523 0.474314193474129 c rs28412942 0
# [2] MT 708 + | 0.463723270455375 0.197433612542227 0.301048603840172 d rs28412942dupl 0
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths
library(GenomicRanges)
m <- findOverlaps(s, t, type = "any", select = "all")
m
#Hits object with 2 hits and 0 metadata columns:
# queryHits subjectHits
# <integer> <integer>
# [1] 1 1
# [2] 2 1
# -------
# queryLength: 2 / subjectLength: 1
# Add gc.name to subject GRanges (i.e. left join)
mcols(t)$gc.name <- NA
mcols(t)[subjectHits(m), "gc.name"] <- mcols(s)[queryHits(m), "gc.name"]
t
# GRanges object with 1 range and 6 metadata columns:
# seqnames ranges strand | pvalue qvalue meth.diff gc.X gc.score gc.name
# <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric> <character> <numeric> <character>
# [1] MT 708 + | 0.336153471143916 0.463723270455375 0.0605853858869523 a 80 rs28412942dupl
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths
I ended up with using plyranges join_overlap_left_directed (or join_overlap_left) and then filtering whatever columns I needed to keep.
plyranges solution:
set.seed(2018)
t <- GRanges(
seqnames = c("MT", "MT"),
IRanges(start = c(708, 800), end = c(708, 800)),
strand = c("+", "-"),
pvalue = runif(2),
qvalue = runif(2),
meth.diff = runif(2),
gc.X = letters[1:2],
gc.score = rep(80, 2))
set.seed(2018)
s <- GRanges(
seqnames = "MT",
IRanges(start = c(708, 708), end = c(708, 708)),
strand = c("+", "+"),
pvalue = runif(2),
qvalue = runif(2),
meth.diff = runif(2),
gc.X = letters[3:4],
gc.name = c("rs28412942", "rs28412942dupl"),
gc.score = 0)
library(plyranges)
t <- join_overlap_left_directed(t, s)
t
# GRanges object with 3 ranges and 11 metadata columns:
# seqnames ranges strand | pvalue.x qvalue.x meth.diff.x gc.X.x gc.score.x pvalue.y qvalue.y meth.diff.y gc.X.y gc.name gc.score.y
# <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric> <character> <numeric> <numeric> <numeric> <numeric> <character> <character> <numeric>
# [1] MT 708 + | 0.336153471143916 0.0605853858869523 0.474314193474129 a 80 0.336153471143916 0.0605853858869523 0.474314193474129 c rs28412942 0
# [2] MT 708 + | 0.336153471143916 0.0605853858869523 0.474314193474129 a 80 0.463723270455375 0.197433612542227 0.301048603840172 d rs28412942dupl 0
# [3] MT 800 - | 0.463723270455375 0.197433612542227 0.301048603840172 b 80 <NA> <NA> <NA> <NA> <NA> <NA>
# -------
# seqinfo: 2 sequences from an unspecified genome; no seqlengths
P.S. I apologize for adding this as an answer but I cannot make comments due to my low reputation.