Continuing a thread from this github issue, there's a couple of CSV's containing SHARE-seq RNA and ATAC data, and they don't fit into RAM, so I want to use HDF5Array. Some test hdf5 files are successfully made using suggestions from that thread, and I'll post both them and the code to make them below. The problem is when I try to read them.
> rhdf5::h5closeAll()
> withr::with_dir( data_dir, {
> HDF5Array::HDF5Array("rna.test.h5", "RNA")[1:4, 1:4]
> })
> withr::with_dir( data_dir, {
> HDF5Array::HDF5Array("atac.test.h5", "ATAC")[1:4, 1:4]
> })
Both reads yield Error in H5Dopen(gid, name) : HDF5. Dataset. Can't open object.
Occasionally, these read commands work, but most of the time they fail. The exact cause is not obvious but the typical pattern is "work fail fail..." or "fail fail fail ...", never "fail work fail ..." (once you fail, always fail thereafter). Restarting R and deleting and remaking the files does not help. I don't know how to post the files yet but I am working on that. In the mean time, here is the code and session info.
library(Matrix)
library(DelayedArray)
source("~/Desktop/jhu/research/datalake/convert_to_hdf5.R")
data_dir = "~/Desktop/jhu/research/datalake/share_seq/skin"
withr::with_dir( new = data_dir, code = {
csvToTENxMatrix(
input_block_nrow = 5,
group = "RNA",
verbose = T,
input_file = "test_file.tsv.gz",
output_file = "rna.test.h5",
input_dim = c(9, 42948),
force = T,
sep = "\t",
row.names = 1
)
csvToTENxMatrix(
input_block_nrow = 5,
input_file = "test_sparse.txt",
output_file = "atac.test.h5",
input_dim = c(344592, 10),
force = T,
sep = " ",
group = "ATAC",
is_sparse_input = T
)
})
rhdf5::h5closeAll()
withr::with_dir( data_dir, {
HDF5Array::HDF5Array("rna.test.h5", "RNA")[1:4, 1:4]
})
withr::with_dir( data_dir, {
HDF5Array::HDF5Array("atac.test.h5", "ATAC")[1:4, 1:4]
})
And here's the the file that gets sourced above (convert_to_hdf5.R
).
library(DelayedArray)
library(HDF5Array)
#' Convert a csv or matrix market file to an HDF5Array.
#'
#' @param input_dim c(nrows, ncols) if dense.
#' same order as 2nd line if matrix market sparse.
#' @param input_file
#' @param output_file
#' @param group Name of the group for the hdf5 output.
#' @param input_block_nrow How many rows to ingest at a time.
#' @param force Overwrite output file if needed
#' @param verbose
#' @param is_sparse_input If T, assumes matrix market coordinate integer general.
#'
#' As suggested by Herve Pages
#' https://github.com/Bioconductor/HDF5Array/issues/47
#' Also expanded to cover matrix market coordinate integer general format
#' (no handling of matrix market special structure e.g. symmetric)
csvToTENxMatrix <- function(
input_file,
input_dim,
output_file,
group,
input_block_nrow,
force=FALSE,
verbose=FALSE,
is_sparse_input = F,
...)
{
# Prepare output file
if (file.exists(output_file)) {
if (!force){
stop("File exists. Pass force=TRUE to overwrite.")
}
if (unlink(output_file) != 0L){
stop("failed to delete file \"", output_file, "\"")
}
}
input_connection <- gzfile(input_file, "r")
output_dim <- as.integer(input_dim)
# Sparse input is handled one column at a time.
# Dense input is handled one row at a time.
if(!is_sparse_input){
output_dim = rev(output_dim)
}
output_sink <- HDF5Array::TENxRealizationSink(output_dim, filepath=output_file, group=group)
sink_grid <- DelayedArray::colGrid(output_sink, ncol=input_block_nrow)
# Prepare input reader
if(!is_sparse_input){
my_reader = function(input_connection, range_to_read, block_index){
input_block = read.csv(input_connection, nrows=width(range_to_read), header=(block_index==1), ...)
if (ncol(input_block) <= 1L)
stop("No delimiters found. Wrong delimiter?")
input_block
}
} else {
# With sparse-format input, I still can't fit it in my puny RAM, so I need hdf5 and random access.
# But I don't know how many nonzeroes are in the block I want to write next.
# This takes care of that.
my_reader = function(input_connection, range_to_read, block_index){
block = matrix(0, ncol = width(range_to_read), nrow = input_dim[[1]])
if (ncol(block) <= 1L)
stop("No delimiters found. Wrong delimiter?")
is_top = T
stop_loop = F
while(!stop_loop){
lines = readLines(input_connection, n = 1000)
if(length(lines)<1){
break
}
X = read.csv(text = lines, header = is_top & (block_index==1), comment.char = "%", ...)
is_top = F
X[["has_overshot"]] = X[[2]] > end( range_to_read )
if( any(X[["has_overshot"]]) ){
pushBack(data = paste0(lines[X[["has_overshot"]]], collapse = "\n"),
connection = input_connection)
X = X[!X[["has_overshot"]],]
stop_loop = T
}
X[[2]] = X[[2]] - (block_index-1)*input_block_nrow
stopifnot(all(X[[2]]>0))
block[as.matrix(X[1:2])] = X[[3]]
}
t(block)
}
}
## Walk on the grid, and, for each viewport, read a block from the CSV file and
## write it to the h5 file.
for (bid in seq_along(sink_grid)) {
viewport <- sink_grid[[bid]]
if (verbose)
cat("Reading row(s) ", as.character(ranges(viewport)[2]), " from ", input_file, " ... ", sep="")
input_block <- my_reader(input_connection,
range_to_read = ranges(viewport)[2],
block_index = bid)
if (nrow(input_block) < width(viewport)[2] )
stop("Too few rows in input.")
if (verbose)
cat("OK\n")
output_block <- t(as.matrix(input_block))
if (verbose)
cat("Writing column(s) ", as.character(ranges(viewport)[2]), " to ", output_file, " ... ", sep="")
write_block(output_sink, viewport, output_block)
if (verbose)
cat("OK\n")
}
close(output_sink)
close(input_connection)
as(output_sink, "DelayedArray")
}
Sessioninfo:
> sessionInfo()
R version 4.0.0 (2020-04-24)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: elementary OS 5.1.7 Hera
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] HDF5Array_1.16.1 rhdf5_2.32.4 DelayedArray_0.14.1 IRanges_2.22.2 S4Vectors_0.26.1 BiocGenerics_0.34.0 matrixStats_0.61.0
[8] Matrix_1.3-4
loaded via a namespace (and not attached):
[1] compiler_4.0.0 tools_4.0.0 withr_2.4.2 grid_4.0.0 lattice_0.20-45 Rhdf5lib_1.10.1
I can't see a built-in way to post files here, but they are public on my DropBox.
The "dense input" link doesn't work at the moment.
However I can confirm I get the same error just by downloading your output .h5 file and running the
HDF5Array()
code.