Last updated on 2025-03-01 06:51:23 CET.
Flavor | Version | Tinstall | Tcheck | Ttotal | Status | Flags |
---|---|---|---|---|---|---|
r-devel-linux-x86_64-debian-clang | 0.2.8 | 45.39 | 407.37 | 452.76 | ERROR | |
r-devel-linux-x86_64-debian-gcc | 0.2.8 | 36.29 | 465.70 | 501.99 | ERROR | |
r-devel-linux-x86_64-fedora-clang | 0.2.8 | 761.06 | ERROR | |||
r-devel-linux-x86_64-fedora-gcc | 0.2.8 | 1078.52 | ERROR | |||
r-devel-macos-arm64 | 0.2.8 | 457.00 | NOTE | |||
r-devel-macos-x86_64 | 0.2.8 | 677.00 | NOTE | |||
r-devel-windows-x86_64 | 0.2.8 | 62.00 | 531.00 | 593.00 | NOTE | |
r-patched-linux-x86_64 | 0.2.8 | 51.06 | 1189.44 | 1240.50 | NOTE | |
r-release-linux-x86_64 | 0.2.8 | 47.99 | 1132.98 | 1180.97 | NOTE | |
r-release-macos-arm64 | 0.2.8 | 275.00 | NOTE | |||
r-release-macos-x86_64 | 0.2.8 | 363.00 | NOTE | |||
r-release-windows-x86_64 | 0.2.8 | 59.00 | 540.00 | 599.00 | NOTE | |
r-oldrel-macos-arm64 | 0.2.8 | NOTE | ||||
r-oldrel-macos-x86_64 | 0.2.8 | 840.00 | NOTE | |||
r-oldrel-windows-x86_64 | 0.2.8 | 73.00 | 758.00 | 831.00 | NOTE |
Version: 0.2.8
Check: compiled code
Result: NOTE
File ‘seqtrie/libs/seqtrie.so’:
Found non-API call to R: ‘STRING_PTR’
Compiled code should not call non-API entry points in R.
See ‘Writing portable packages’ in the ‘Writing R Extensions’ manual,
and section ‘Moving into C API compliance’ for issues with the use of
non-API entry points.
Flavors: r-devel-linux-x86_64-debian-clang, r-devel-linux-x86_64-debian-gcc, r-devel-linux-x86_64-fedora-clang, r-devel-linux-x86_64-fedora-gcc, r-devel-macos-arm64, r-devel-macos-x86_64
Version: 0.2.8
Check: tests
Result: ERROR
Running ‘test_RadixForest.R’ [16s/14s]
Running ‘test_RadixTree.R’ [268s/179s]
Running ‘test_pairwise.R’ [79s/55s]
Running the tests in ‘tests/test_pairwise.R’ failed.
Complete output:
> # This test file tests the `dist_matrix` and `dist_pairwise` functions
> # These two functions are simple dynamic programming algorithms for computing pairwise distances and are themselves used to validate
> # the RadixTree imeplementation (see test_radix_tree.R)
>
> if(requireNamespace("seqtrie", quietly=TRUE) &&
+ requireNamespace("stringi", quietly=TRUE) &&
+ requireNamespace("stringdist", quietly=TRUE) &&
+ requireNamespace("Biostrings", quietly=TRUE) &&
+ requireNamespace("dplyr", quietly=TRUE) &&
+ # pwalign is only required for Biostrings >= 2.72.0 in R 4.4+
+ ( (packageVersion("Biostrings") < "2.72.0") || requireNamespace("pwalign", quietly=TRUE) )
+ ) {
+
+ library(seqtrie)
+ library(stringi)
+ library(stringdist)
+ library(Biostrings)
+ library(dplyr)
+
+ # Use 2 threads on github actions and CRAN, 4 threads locally
+ IS_LOCAL <- Sys.getenv("IS_LOCAL") != ""
+ NTHREADS <- ifelse(IS_LOCAL, 4, 2)
+ NITER <- ifelse(IS_LOCAL, 4, 1)
+ NSEQS <- 10000
+ MAXSEQLEN <- 200
+ CHARSET <- "ACGT"
+
+ random_strings <- function(N, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset_stri <- paste0("[", charset, "]")
+ len <- sample(0:MAXSEQLEN, N, replace=TRUE)
+ result <- lapply(0:MAXSEQLEN, function(x) {
+ nx <- sum(len == x)
+ if(nx == 0) return(character())
+ stringi::stri_rand_strings(nx, x, pattern = charset_stri)
+ })
+ sample(unlist(result))
+ }
+
+ mutate_strings <- function(x, prob = 0.025, indel_prob = 0.025, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset <- unlist(strsplit(charset, ""))
+ xsplit <- strsplit(x, "")
+ sapply(xsplit, function(a) {
+ r <- runif(length(a)) < prob
+ a[r] <- sample(charset, sum(r), replace=TRUE)
+ ins <- runif(length(a)) < indel_prob
+ a[ins] <- paste0(sample(charset, sum(ins), replace=TRUE), sample(charset, sum(ins), replace=TRUE))
+ del <- runif(length(a)) < indel_prob
+ a[del] <- ""
+ paste0(a, collapse = "")
+ })
+ }
+
+ # Biostrings notes:
+ # subject (target) must be of length 1 or equal to pattern (query)
+ # To get a distance matrix, iterate over target and perform a column bind
+ # special_zero_case -- if both query and target are empty, Biostrings fails with an error
+ pairwiseAlignmentFix <- function(pattern, subject, ...) {
+ results <- rep(0, length(subject))
+ special_zero_case <- nchar(pattern) == 0 & nchar(subject) == 0
+ if(all(special_zero_case)) {
+ results
+ } else {
+ results[!special_zero_case] <- Biostrings::pairwiseAlignment(pattern=pattern[!special_zero_case], subject=subject[!special_zero_case], ...)
+ results
+ }
+ }
+
+ biostrings_matrix_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(query, function(x) {
+ query2 <- rep(x, length(target))
+ -pairwiseAlignmentFix(pattern=query2, subject=target, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
+
+ biostrings_pairwise_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ -pairwiseAlignment(pattern=query, subject=target, substitutionMatrix = substitutionMatrix,gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
+
+ biostrings_matrix_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(seq_along(query), function(i) {
+ query2 <- substring(query[i], 1, query_size[i,,drop=TRUE])
+ target2 <- substring(target, 1, target_size[i,,drop=TRUE])
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
+
+ biostrings_pairwise_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ query2 <- substring(query, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
+
+ for(. in 1:NITER) {
+
+ print("Checking hamming search correctness")
+ local({
+ # Note: seqtrie returns `NA_integer_` for hamming distance when the lengths are different
+ # whereas stringdist returns `Inf`
+ # This is why we need to replace `NA_integer_` with `Inf` when comparing results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking levenshtein search correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking anchored search correctness")
+ local({
+ # There is no anchored search in stringdist (or elsewhere). To get the same results, we substring the query and target sequences
+ # By the results of the seqtrie anchored search and then compare the results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_stringdist <- lapply(seq_along(query), function(i) {
+ query_size2 <- query_size[i,,drop=TRUE]
+ target_size2 <- target_size[i,,drop=TRUE]
+ query2 <- substring(query[i], 1, query_size2) # query[i] is recycled
+ target2 <- substring(target, 1, target_size2)
+ stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ }) %>% do.call(rbind, .)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ query2 <- substring(query_pairwise, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ results_stringdist <- stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking global search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+
+ print("Checking global search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cos=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+ }
+
+ }
Loading required package: BiocGenerics
Loading required package: generics
Attaching package: 'generics'
The following objects are masked from 'package:base':
as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
setequal, union
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
as.data.frame, basename, cbind, colnames, dirname, do.call,
duplicated, eval, evalq, get, grep, grepl, is.unsorted, lapply,
mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
rank, rbind, rownames, sapply, saveRDS, table, tapply, unique,
unsplit, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
I, expand.grid, unname
Loading required package: IRanges
Loading required package: XVector
Loading required package: GenomeInfoDb
Attaching package: 'Biostrings'
The following object is masked from 'package:base':
strsplit
Attaching package: 'dplyr'
The following objects are masked from 'package:Biostrings':
collapse, intersect, setdiff, setequal, union
The following object is masked from 'package:GenomeInfoDb':
intersect
The following object is masked from 'package:XVector':
slice
The following objects are masked from 'package:IRanges':
collapse, desc, intersect, setdiff, slice, union
The following objects are masked from 'package:S4Vectors':
first, intersect, rename, setdiff, setequal, union
The following objects are masked from 'package:BiocGenerics':
combine, intersect, setdiff, setequal, union
The following object is masked from 'package:generics':
explain
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
[1] "Checking hamming search correctness"
[1] "Checking levenshtein search correctness"
[1] "Checking anchored search correctness"
[1] "Checking global search with linear gap for correctness"
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'args' in selecting a method for function 'do.call': 'recursive' must be a length-1 vector
Calls: <Anonymous> ... mpi.XStringSet.pairwiseAlignment -> XStringSet.pairwiseAlignment -> array -> unlist
In addition: Warning message:
In .call_fun_in_pwalign("pairwiseAlignment", ...) :
pairwiseAlignment() has moved from Biostrings to the pwalign package, and is
formally deprecated in Biostrings >= 2.75.1. Please call
pwalign::pairwiseAlignment() to get rid of this warning.
Execution halted
Flavor: r-devel-linux-x86_64-debian-clang
Version: 0.2.8
Check: tests
Result: ERROR
Running ‘test_RadixForest.R’ [20s/18s]
Running ‘test_RadixTree.R’ [335s/207s]
Running ‘test_pairwise.R’ [77s/59s]
Running the tests in ‘tests/test_pairwise.R’ failed.
Complete output:
> # This test file tests the `dist_matrix` and `dist_pairwise` functions
> # These two functions are simple dynamic programming algorithms for computing pairwise distances and are themselves used to validate
> # the RadixTree imeplementation (see test_radix_tree.R)
>
> if(requireNamespace("seqtrie", quietly=TRUE) &&
+ requireNamespace("stringi", quietly=TRUE) &&
+ requireNamespace("stringdist", quietly=TRUE) &&
+ requireNamespace("Biostrings", quietly=TRUE) &&
+ requireNamespace("dplyr", quietly=TRUE) &&
+ # pwalign is only required for Biostrings >= 2.72.0 in R 4.4+
+ ( (packageVersion("Biostrings") < "2.72.0") || requireNamespace("pwalign", quietly=TRUE) )
+ ) {
+
+ library(seqtrie)
+ library(stringi)
+ library(stringdist)
+ library(Biostrings)
+ library(dplyr)
+
+ # Use 2 threads on github actions and CRAN, 4 threads locally
+ IS_LOCAL <- Sys.getenv("IS_LOCAL") != ""
+ NTHREADS <- ifelse(IS_LOCAL, 4, 2)
+ NITER <- ifelse(IS_LOCAL, 4, 1)
+ NSEQS <- 10000
+ MAXSEQLEN <- 200
+ CHARSET <- "ACGT"
+
+ random_strings <- function(N, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset_stri <- paste0("[", charset, "]")
+ len <- sample(0:MAXSEQLEN, N, replace=TRUE)
+ result <- lapply(0:MAXSEQLEN, function(x) {
+ nx <- sum(len == x)
+ if(nx == 0) return(character())
+ stringi::stri_rand_strings(nx, x, pattern = charset_stri)
+ })
+ sample(unlist(result))
+ }
+
+ mutate_strings <- function(x, prob = 0.025, indel_prob = 0.025, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset <- unlist(strsplit(charset, ""))
+ xsplit <- strsplit(x, "")
+ sapply(xsplit, function(a) {
+ r <- runif(length(a)) < prob
+ a[r] <- sample(charset, sum(r), replace=TRUE)
+ ins <- runif(length(a)) < indel_prob
+ a[ins] <- paste0(sample(charset, sum(ins), replace=TRUE), sample(charset, sum(ins), replace=TRUE))
+ del <- runif(length(a)) < indel_prob
+ a[del] <- ""
+ paste0(a, collapse = "")
+ })
+ }
+
+ # Biostrings notes:
+ # subject (target) must be of length 1 or equal to pattern (query)
+ # To get a distance matrix, iterate over target and perform a column bind
+ # special_zero_case -- if both query and target are empty, Biostrings fails with an error
+ pairwiseAlignmentFix <- function(pattern, subject, ...) {
+ results <- rep(0, length(subject))
+ special_zero_case <- nchar(pattern) == 0 & nchar(subject) == 0
+ if(all(special_zero_case)) {
+ results
+ } else {
+ results[!special_zero_case] <- Biostrings::pairwiseAlignment(pattern=pattern[!special_zero_case], subject=subject[!special_zero_case], ...)
+ results
+ }
+ }
+
+ biostrings_matrix_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(query, function(x) {
+ query2 <- rep(x, length(target))
+ -pairwiseAlignmentFix(pattern=query2, subject=target, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
+
+ biostrings_pairwise_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ -pairwiseAlignment(pattern=query, subject=target, substitutionMatrix = substitutionMatrix,gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
+
+ biostrings_matrix_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(seq_along(query), function(i) {
+ query2 <- substring(query[i], 1, query_size[i,,drop=TRUE])
+ target2 <- substring(target, 1, target_size[i,,drop=TRUE])
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
+
+ biostrings_pairwise_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ query2 <- substring(query, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
+
+ for(. in 1:NITER) {
+
+ print("Checking hamming search correctness")
+ local({
+ # Note: seqtrie returns `NA_integer_` for hamming distance when the lengths are different
+ # whereas stringdist returns `Inf`
+ # This is why we need to replace `NA_integer_` with `Inf` when comparing results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking levenshtein search correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking anchored search correctness")
+ local({
+ # There is no anchored search in stringdist (or elsewhere). To get the same results, we substring the query and target sequences
+ # By the results of the seqtrie anchored search and then compare the results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_stringdist <- lapply(seq_along(query), function(i) {
+ query_size2 <- query_size[i,,drop=TRUE]
+ target_size2 <- target_size[i,,drop=TRUE]
+ query2 <- substring(query[i], 1, query_size2) # query[i] is recycled
+ target2 <- substring(target, 1, target_size2)
+ stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ }) %>% do.call(rbind, .)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ query2 <- substring(query_pairwise, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ results_stringdist <- stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking global search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+
+ print("Checking global search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cos=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+ }
+
+ }
Loading required package: BiocGenerics
Loading required package: generics
Attaching package: 'generics'
The following objects are masked from 'package:base':
as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
setequal, union
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
as.data.frame, basename, cbind, colnames, dirname, do.call,
duplicated, eval, evalq, get, grep, grepl, is.unsorted, lapply,
mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
rank, rbind, rownames, sapply, saveRDS, table, tapply, unique,
unsplit, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
I, expand.grid, unname
Loading required package: IRanges
Loading required package: XVector
Loading required package: GenomeInfoDb
Attaching package: 'Biostrings'
The following object is masked from 'package:base':
strsplit
Attaching package: 'dplyr'
The following objects are masked from 'package:Biostrings':
collapse, intersect, setdiff, setequal, union
The following object is masked from 'package:GenomeInfoDb':
intersect
The following object is masked from 'package:XVector':
slice
The following objects are masked from 'package:IRanges':
collapse, desc, intersect, setdiff, slice, union
The following objects are masked from 'package:S4Vectors':
first, intersect, rename, setdiff, setequal, union
The following objects are masked from 'package:BiocGenerics':
combine, intersect, setdiff, setequal, union
The following object is masked from 'package:generics':
explain
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
[1] "Checking hamming search correctness"
[1] "Checking levenshtein search correctness"
[1] "Checking anchored search correctness"
[1] "Checking global search with linear gap for correctness"
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'args' in selecting a method for function 'do.call': 'recursive' must be a length-1 vector
Calls: <Anonymous> ... mpi.XStringSet.pairwiseAlignment -> XStringSet.pairwiseAlignment -> array -> unlist
In addition: Warning message:
In .call_fun_in_pwalign("pairwiseAlignment", ...) :
pairwiseAlignment() has moved from Biostrings to the pwalign package, and is
formally deprecated in Biostrings >= 2.75.1. Please call
pwalign::pairwiseAlignment() to get rid of this warning.
Execution halted
Flavor: r-devel-linux-x86_64-debian-gcc
Version: 0.2.8
Check: tests
Result: ERROR
Running ‘test_RadixForest.R’ [30s/25s]
Running ‘test_RadixTree.R’ [449s/301s]
Running ‘test_pairwise.R’ [124s/86s]
Running the tests in ‘tests/test_pairwise.R’ failed.
Complete output:
> # This test file tests the `dist_matrix` and `dist_pairwise` functions
> # These two functions are simple dynamic programming algorithms for computing pairwise distances and are themselves used to validate
> # the RadixTree imeplementation (see test_radix_tree.R)
>
> if(requireNamespace("seqtrie", quietly=TRUE) &&
+ requireNamespace("stringi", quietly=TRUE) &&
+ requireNamespace("stringdist", quietly=TRUE) &&
+ requireNamespace("Biostrings", quietly=TRUE) &&
+ requireNamespace("dplyr", quietly=TRUE) &&
+ # pwalign is only required for Biostrings >= 2.72.0 in R 4.4+
+ ( (packageVersion("Biostrings") < "2.72.0") || requireNamespace("pwalign", quietly=TRUE) )
+ ) {
+
+ library(seqtrie)
+ library(stringi)
+ library(stringdist)
+ library(Biostrings)
+ library(dplyr)
+
+ # Use 2 threads on github actions and CRAN, 4 threads locally
+ IS_LOCAL <- Sys.getenv("IS_LOCAL") != ""
+ NTHREADS <- ifelse(IS_LOCAL, 4, 2)
+ NITER <- ifelse(IS_LOCAL, 4, 1)
+ NSEQS <- 10000
+ MAXSEQLEN <- 200
+ CHARSET <- "ACGT"
+
+ random_strings <- function(N, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset_stri <- paste0("[", charset, "]")
+ len <- sample(0:MAXSEQLEN, N, replace=TRUE)
+ result <- lapply(0:MAXSEQLEN, function(x) {
+ nx <- sum(len == x)
+ if(nx == 0) return(character())
+ stringi::stri_rand_strings(nx, x, pattern = charset_stri)
+ })
+ sample(unlist(result))
+ }
+
+ mutate_strings <- function(x, prob = 0.025, indel_prob = 0.025, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset <- unlist(strsplit(charset, ""))
+ xsplit <- strsplit(x, "")
+ sapply(xsplit, function(a) {
+ r <- runif(length(a)) < prob
+ a[r] <- sample(charset, sum(r), replace=TRUE)
+ ins <- runif(length(a)) < indel_prob
+ a[ins] <- paste0(sample(charset, sum(ins), replace=TRUE), sample(charset, sum(ins), replace=TRUE))
+ del <- runif(length(a)) < indel_prob
+ a[del] <- ""
+ paste0(a, collapse = "")
+ })
+ }
+
+ # Biostrings notes:
+ # subject (target) must be of length 1 or equal to pattern (query)
+ # To get a distance matrix, iterate over target and perform a column bind
+ # special_zero_case -- if both query and target are empty, Biostrings fails with an error
+ pairwiseAlignmentFix <- function(pattern, subject, ...) {
+ results <- rep(0, length(subject))
+ special_zero_case <- nchar(pattern) == 0 & nchar(subject) == 0
+ if(all(special_zero_case)) {
+ results
+ } else {
+ results[!special_zero_case] <- Biostrings::pairwiseAlignment(pattern=pattern[!special_zero_case], subject=subject[!special_zero_case], ...)
+ results
+ }
+ }
+
+ biostrings_matrix_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(query, function(x) {
+ query2 <- rep(x, length(target))
+ -pairwiseAlignmentFix(pattern=query2, subject=target, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
+
+ biostrings_pairwise_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ -pairwiseAlignment(pattern=query, subject=target, substitutionMatrix = substitutionMatrix,gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
+
+ biostrings_matrix_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(seq_along(query), function(i) {
+ query2 <- substring(query[i], 1, query_size[i,,drop=TRUE])
+ target2 <- substring(target, 1, target_size[i,,drop=TRUE])
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
+
+ biostrings_pairwise_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ query2 <- substring(query, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
+
+ for(. in 1:NITER) {
+
+ print("Checking hamming search correctness")
+ local({
+ # Note: seqtrie returns `NA_integer_` for hamming distance when the lengths are different
+ # whereas stringdist returns `Inf`
+ # This is why we need to replace `NA_integer_` with `Inf` when comparing results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking levenshtein search correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking anchored search correctness")
+ local({
+ # There is no anchored search in stringdist (or elsewhere). To get the same results, we substring the query and target sequences
+ # By the results of the seqtrie anchored search and then compare the results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_stringdist <- lapply(seq_along(query), function(i) {
+ query_size2 <- query_size[i,,drop=TRUE]
+ target_size2 <- target_size[i,,drop=TRUE]
+ query2 <- substring(query[i], 1, query_size2) # query[i] is recycled
+ target2 <- substring(target, 1, target_size2)
+ stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ }) %>% do.call(rbind, .)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ query2 <- substring(query_pairwise, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ results_stringdist <- stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking global search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+
+ print("Checking global search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cos=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+ }
+
+ }
Loading required package: BiocGenerics
Loading required package: generics
Attaching package: 'generics'
The following objects are masked from 'package:base':
as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
setequal, union
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
as.data.frame, basename, cbind, colnames, dirname, do.call,
duplicated, eval, evalq, get, grep, grepl, is.unsorted, lapply,
mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
rank, rbind, rownames, sapply, saveRDS, table, tapply, unique,
unsplit, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
I, expand.grid, unname
Loading required package: IRanges
Loading required package: XVector
Loading required package: GenomeInfoDb
Attaching package: 'Biostrings'
The following object is masked from 'package:base':
strsplit
Attaching package: 'dplyr'
The following objects are masked from 'package:Biostrings':
collapse, intersect, setdiff, setequal, union
The following object is masked from 'package:GenomeInfoDb':
intersect
The following object is masked from 'package:XVector':
slice
The following objects are masked from 'package:IRanges':
collapse, desc, intersect, setdiff, slice, union
The following objects are masked from 'package:S4Vectors':
first, intersect, rename, setdiff, setequal, union
The following objects are masked from 'package:BiocGenerics':
combine, intersect, setdiff, setequal, union
The following object is masked from 'package:generics':
explain
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
[1] "Checking hamming search correctness"
[1] "Checking levenshtein search correctness"
[1] "Checking anchored search correctness"
[1] "Checking global search with linear gap for correctness"
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'args' in selecting a method for function 'do.call': 'recursive' must be a length-1 vector
Calls: <Anonymous> ... mpi.XStringSet.pairwiseAlignment -> XStringSet.pairwiseAlignment -> array -> unlist
In addition: Warning message:
In .call_fun_in_pwalign("pairwiseAlignment", ...) :
pairwiseAlignment() has moved from Biostrings to the pwalign package, and is
formally deprecated in Biostrings >= 2.75.1. Please call
pwalign::pairwiseAlignment() to get rid of this warning.
Execution halted
Flavor: r-devel-linux-x86_64-fedora-clang
Version: 0.2.8
Check: tests
Result: ERROR
Running ‘test_RadixForest.R’ [24s/19s]
Running ‘test_RadixTree.R’ [694s/361s]
Running ‘test_pairwise.R’ [177s/99s]
Running the tests in ‘tests/test_pairwise.R’ failed.
Complete output:
> # This test file tests the `dist_matrix` and `dist_pairwise` functions
> # These two functions are simple dynamic programming algorithms for computing pairwise distances and are themselves used to validate
> # the RadixTree imeplementation (see test_radix_tree.R)
>
> if(requireNamespace("seqtrie", quietly=TRUE) &&
+ requireNamespace("stringi", quietly=TRUE) &&
+ requireNamespace("stringdist", quietly=TRUE) &&
+ requireNamespace("Biostrings", quietly=TRUE) &&
+ requireNamespace("dplyr", quietly=TRUE) &&
+ # pwalign is only required for Biostrings >= 2.72.0 in R 4.4+
+ ( (packageVersion("Biostrings") < "2.72.0") || requireNamespace("pwalign", quietly=TRUE) )
+ ) {
+
+ library(seqtrie)
+ library(stringi)
+ library(stringdist)
+ library(Biostrings)
+ library(dplyr)
+
+ # Use 2 threads on github actions and CRAN, 4 threads locally
+ IS_LOCAL <- Sys.getenv("IS_LOCAL") != ""
+ NTHREADS <- ifelse(IS_LOCAL, 4, 2)
+ NITER <- ifelse(IS_LOCAL, 4, 1)
+ NSEQS <- 10000
+ MAXSEQLEN <- 200
+ CHARSET <- "ACGT"
+
+ random_strings <- function(N, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset_stri <- paste0("[", charset, "]")
+ len <- sample(0:MAXSEQLEN, N, replace=TRUE)
+ result <- lapply(0:MAXSEQLEN, function(x) {
+ nx <- sum(len == x)
+ if(nx == 0) return(character())
+ stringi::stri_rand_strings(nx, x, pattern = charset_stri)
+ })
+ sample(unlist(result))
+ }
+
+ mutate_strings <- function(x, prob = 0.025, indel_prob = 0.025, charset = "abcdefghijklmnopqrstuvwxyz") {
+ charset <- unlist(strsplit(charset, ""))
+ xsplit <- strsplit(x, "")
+ sapply(xsplit, function(a) {
+ r <- runif(length(a)) < prob
+ a[r] <- sample(charset, sum(r), replace=TRUE)
+ ins <- runif(length(a)) < indel_prob
+ a[ins] <- paste0(sample(charset, sum(ins), replace=TRUE), sample(charset, sum(ins), replace=TRUE))
+ del <- runif(length(a)) < indel_prob
+ a[del] <- ""
+ paste0(a, collapse = "")
+ })
+ }
+
+ # Biostrings notes:
+ # subject (target) must be of length 1 or equal to pattern (query)
+ # To get a distance matrix, iterate over target and perform a column bind
+ # special_zero_case -- if both query and target are empty, Biostrings fails with an error
+ pairwiseAlignmentFix <- function(pattern, subject, ...) {
+ results <- rep(0, length(subject))
+ special_zero_case <- nchar(pattern) == 0 & nchar(subject) == 0
+ if(all(special_zero_case)) {
+ results
+ } else {
+ results[!special_zero_case] <- Biostrings::pairwiseAlignment(pattern=pattern[!special_zero_case], subject=subject[!special_zero_case], ...)
+ results
+ }
+ }
+
+ biostrings_matrix_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(query, function(x) {
+ query2 <- rep(x, length(target))
+ -pairwiseAlignmentFix(pattern=query2, subject=target, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
+
+ biostrings_pairwise_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ -pairwiseAlignment(pattern=query, subject=target, substitutionMatrix = substitutionMatrix,gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
+
+ biostrings_matrix_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ lapply(seq_along(query), function(i) {
+ query2 <- substring(query[i], 1, query_size[i,,drop=TRUE])
+ target2 <- substring(target, 1, target_size[i,,drop=TRUE])
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }) %>% do.call(rbind, .)
+ }
+
+ biostrings_pairwise_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+ substitutionMatrix <- -cost_matrix
+ query2 <- substring(query, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
+
+ for(. in 1:NITER) {
+
+ print("Checking hamming search correctness")
+ local({
+ # Note: seqtrie returns `NA_integer_` for hamming distance when the lengths are different
+ # whereas stringdist returns `Inf`
+ # This is why we need to replace `NA_integer_` with `Inf` when comparing results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "hamming", nthreads=NTHREADS)
+ results_seqtrie[is.na(results_seqtrie)] <- Inf
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "hamming", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking levenshtein search correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdistmatrix(query, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", nthreads=NTHREADS)
+ results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking anchored search correctness")
+ local({
+ # There is no anchored search in stringdist (or elsewhere). To get the same results, we substring the query and target sequences
+ # By the results of the seqtrie anchored search and then compare the results
+
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_stringdist <- lapply(seq_along(query), function(i) {
+ query_size2 <- query_size[i,,drop=TRUE]
+ target_size2 <- target_size[i,,drop=TRUE]
+ query2 <- substring(query[i], 1, query_size2) # query[i] is recycled
+ target2 <- substring(target, 1, target_size2)
+ stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ }) %>% do.call(rbind, .)
+ stopifnot(all(results_seqtrie == results_stringdist))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ query2 <- substring(query_pairwise, 1, query_size)
+ target2 <- substring(target, 1, target_size)
+ results_stringdist <- stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+ stopifnot(all(results_seqtrie == results_stringdist))
+ })
+
+ print("Checking global search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with linear gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+
+ print("Checking global search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cos=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+
+ print("Checking anchored search with affine gap for correctness")
+ local({
+ target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+ query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+ query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+
+ # Check matrix results
+ cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+ diag(cost_matrix) <- 0
+ colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+ gap_cost <- sample(1:3, size = 1)
+ gap_open_cost <- sample(1:3, size = 1)
+ results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+
+ # Check pairwise results
+ query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+ results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+ query_size <- attr(results_seqtrie, "query_size")
+ target_size <- attr(results_seqtrie, "target_size")
+ results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+ stopifnot(all(results_seqtrie == results_biostrings))
+ })
+ }
+
+ }
Loading required package: BiocGenerics
Loading required package: generics
Attaching package: 'generics'
The following objects are masked from 'package:base':
as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
setequal, union
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
as.data.frame, basename, cbind, colnames, dirname, do.call,
duplicated, eval, evalq, get, grep, grepl, is.unsorted, lapply,
mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
rank, rbind, rownames, sapply, saveRDS, table, tapply, unique,
unsplit, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
I, expand.grid, unname
Loading required package: IRanges
Loading required package: XVector
Loading required package: GenomeInfoDb
Attaching package: 'Biostrings'
The following object is masked from 'package:base':
strsplit
Attaching package: 'dplyr'
The following objects are masked from 'package:Biostrings':
collapse, intersect, setdiff, setequal, union
The following object is masked from 'package:GenomeInfoDb':
intersect
The following object is masked from 'package:XVector':
slice
The following objects are masked from 'package:IRanges':
collapse, desc, intersect, setdiff, slice, union
The following objects are masked from 'package:S4Vectors':
first, intersect, rename, setdiff, setequal, union
The following objects are masked from 'package:BiocGenerics':
combine, intersect, setdiff, setequal, union
The following object is masked from 'package:generics':
explain
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
[1] "Checking hamming search correctness"
[1] "Checking levenshtein search correctness"
[1] "Checking anchored search correctness"
[1] "Checking global search with linear gap for correctness"
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'args' in selecting a method for function 'do.call': 'recursive' must be a length-1 vector
Calls: <Anonymous> ... mpi.XStringSet.pairwiseAlignment -> XStringSet.pairwiseAlignment -> array -> unlist
In addition: Warning message:
In .call_fun_in_pwalign("pairwiseAlignment", ...) :
pairwiseAlignment() has moved from Biostrings to the pwalign package, and is
formally deprecated in Biostrings >= 2.75.1. Please call
pwalign::pairwiseAlignment() to get rid of this warning.
Execution halted
Flavor: r-devel-linux-x86_64-fedora-gcc
Version: 0.2.8
Check: compiled code
Result: NOTE
File 'seqtrie/libs/x64/seqtrie.dll':
Found non-API call to R: 'STRING_PTR'
Compiled code should not call non-API entry points in R.
See 'Writing portable packages' in the 'Writing R Extensions' manual,
and section 'Moving into C API compliance' for issues with the use of
non-API entry points.
Flavor: r-devel-windows-x86_64
Version: 0.2.8
Check: for GNU extensions in Makefiles
Result: NOTE
GNU make is a SystemRequirements.
Flavors: r-patched-linux-x86_64, r-release-linux-x86_64, r-release-macos-arm64, r-release-macos-x86_64, r-release-windows-x86_64, r-oldrel-macos-arm64, r-oldrel-macos-x86_64, r-oldrel-windows-x86_64
Version: 0.2.8
Check: installed package size
Result: NOTE
installed size is 5.9Mb
sub-directories of 1Mb or more:
data 1.1Mb
libs 4.3Mb
Flavors: r-release-macos-arm64, r-release-macos-x86_64, r-oldrel-macos-arm64, r-oldrel-macos-x86_64
Version: 0.2.8
Check: package dependencies
Result: NOTE
Package suggested but not available for checking: ‘pwalign’
Flavors: r-release-macos-x86_64, r-oldrel-macos-arm64, r-oldrel-macos-x86_64, r-oldrel-windows-x86_64