[R] Whole genome searching of 100bp "D" sequence

From: Amos Folarin <amosfolarin_at_gmail.com>
Date: Fri, 15 Apr 2011 19:53:59 +0100


Hi,
I was wondering I'm going about this in the correct way. I need to test if there are coding sequences or exons in hg19 which match a string of 100bp
"D" i.e. [A,G or T]. However I'm getting a strange result.

I get a hit on chr7, using the 100bp search however when I search with 60bp sequence of "D" I don't get any hits.

library("BSgenome")
library("Biostrings")
library("BSgenome.Hsapiens.UCSC.hg19")
library("biomaRt")
library("GenomicFeatures")




#extract the alignments which match real genes #(add this onto stefans script req to buid the gr object using the regions from the BSgenome alignment of the 100bp seqs.)

txdb <- makeTranscriptDbFromUCSC(genome = "hg19", tablename = "knownGene") ## do once locally & save

#query.plus <-

DNAString("DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD")
# 100bp C free sequence
query.plus <-
DNAString("DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD") #
60bp C free sequence
query.minus <- reverseComplement(query.plus)

chrList <- c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8",
"chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16",
"chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY")

#access/group the by subset of annotation annotGr <- exons(txdb)
#annotGr <- cds(txdb)

wholeGenomeMatch <- function()
{

    for(i in chrList)
    {
    #matches on plus strand
    matches.plus.strand <- matchPattern(query.plus, Hsapiens[[i]], fixed=FALSE, max.mismatch=5)

    mp <- as.matrix(matches.plus.strand)

    #matches on minus strand
    matches.minus.strand <- matchPattern(query.minus, Hsapiens[[i]], fixed=FALSE, max.mismatch=5)

    mm <- as.matrix(matches.minus.strand)

    OL.p <- NULL
    if(nrow(mp) > 0)
    {
    ##### MATCH THE POSITIVE STRAND ######     #need to get start and end positions (end = start + (length-1))     gr <- GRanges(
    seqnames = rep(i,nrow(mp)),
    ranges = IRanges(start = mp[[1]],
    end = mp[[1]]+(mp[,2]-1)),
    strand = rep("+", nrow(mp)))

    #OL <- findOverlaps(query=gr, subject=annotGr)     OL.p <- annotGr[(!is.na(match(annotGr, gr)))]     #as.data.frame(OL) ## view the results

    #tdata.p <- annotGr[unique(queryHits(OL)),]     #tdata.p <- as.data.frame(tdata.p, row.names = NULL, optional = FALSE)

    #write.table(tdata.p, paste(i,"plus.txt"))     cat( paste(i,"plus.txt\n"))
    }

    OL.m <- NULL
    if(nrow(mm) > 0)
    {
    ##### MATCH THE MINUS STRAND ########     #need to get start and end positions (end = start + (length-1))     gr <- GRanges(
    seqnames = rep(i,nrow(mm)),
    ranges = IRanges(start = mm[[1]],
    end = mm[[1]]+(mm[,2]-1)),
    strand = rep("-", nrow(mm)))

    #OL <- findOverlaps(query=gr, subject=annotGr)     OL.m <- annotGr[(!is.na(match(annotGr, gr)))]     #tdata.m <- annotGr[unique(queryHits(OL)),]     #tdata.m <- as.data.frame(tdata.m, row.names = NULL, optional = FALSE)

    #write.table(tdata.m, paste(i,"minus.txt"))

    cat( paste(i,"minus.txt\n"))

    #write.table(rbind(tdata.p, tdata.p), paste(i, ".txt"))     }

    ##write all matching results into one file     write.table(rbind(as.data.frame(OL.p), as.data.frame(OL.m)), file="60bp_v_exons_max-mismatch=5.txt", append=TRUE)

    }

}

wholeGenomeMatch()

#in the first instance with the 100bp search I get 1 hit in the output file:
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"1" "chr7" 420815 422845 2031 "+" 97277
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"

#with the 60 bp query string my output file reads no hits
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"
"seqnames" "start" "end" "width" "strand" "exon_id"

Your help is much appreciated,
Amos

        [[alternative HTML version deleted]]



R-help_at_r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code. Received on Fri 15 Apr 2011 - 18:57:09 GMT

Archive maintained by Robert King, hosted by the discipline of statistics at the University of Newcastle, Australia.
Archive generated by hypermail 2.2.0, at Fri 15 Apr 2011 - 22:30:30 GMT.

Mailing list information is available at https://stat.ethz.ch/mailman/listinfo/r-help. Please read the posting guide before posting to the list.

list of date sections of archive