Things may have changed since I did this, but in my experience you have to instantiate an HDF5 object with the expected number of columns and rows, and then you can dump data in. Here's a function I wrote back in the day to make an HDF5-backed SummarizedExperiment
with data I couldn't read in all at once. It's a different use case, but I imagine you can see the broad outlines.
#' Build an HDF5-backed SummarizedExperiment one chunk at a time
#'
#' Creates (or resumes filling) the dataset "assay1" in `<dir>/assays.h5`,
#' writing one block of rows per element of `kgranges`, then wraps that
#' dataset in a SummarizedExperiment and saves it as `<dir>/se.rds`.
#'
#' Depends on `populateMatrix()` and `addInBaseline()`, which are defined
#' outside this function, and on the rhdf5 functions `h5createFile()`,
#' `h5createDataset()`, `h5write()` and `H5close()` being on the search path.
#'
#' @param kgranges List-like collection of range objects; each element
#'   contributes `length(element)` rows and is processed in its own pass.
#'   It is also coerced to a GRangesList and unlisted to form the row ranges.
#' @param funcgranges Object handed to `populateMatrix()` for the first block
#'   of columns; its length counts toward the dataset width.
#' @param gtexgranges Object handed to `populateMatrix()` (with a third
#'   argument of `TRUE`) for the second block of columns; its length counts
#'   toward the dataset width.
#' @param baselinefile Baseline file paths, indexed in parallel with
#'   `kgranges`. The first line of the first file (tab-separated, first five
#'   fields dropped) supplies the baseline column names.
#' @param dir Output directory; created if it does not exist.
#' @param fnames File paths used to fill the `Path` and `Filename` columns of
#'   the column data.
#' @param startover If `TRUE`, delete any existing `assays.h5` and start from
#'   the first row. If `FALSE`, resume at `startwith`.
#' @param startwith Index of the `kgranges` element to resume from; required
#'   when `startover` is `FALSE`. The full, original `kgranges` must still be
#'   supplied so that row offsets and row names can be computed.
#' @return The value of `saveRDS()` (invisible `NULL`); the function is called
#'   for its side effects. The result can be re-opened later from `dir`.
makeSE.hdf5 <- function(kgranges, funcgranges, gtexgranges, baselinefile, dir = "my_h5_se",
                        fnames, startover = TRUE, startwith = NULL) {
  # library() raises an error when a package is missing, whereas require()
  # only returns FALSE and lets the function fail later with an obscure message.
  library("SummarizedExperiment")
  library("HDF5Array")

  # Reject an incomplete resume request before anything is created on disk.
  if (!startover && is.null(startwith)) {
    stop(paste("If not starting over, remember to supply the original kgranges object with",
               "a startwith value that represents the kgranges list item to start with."),
         call. = FALSE)
  }

  if (!dir.exists(dir)) dir.create(dir, recursive = TRUE)
  fn <- file.path(dir, "assays.h5")

  # Row ranges and row names are taken from the complete kgranges, before any
  # leading elements are dropped for a resumed run.
  rr <- unlist(as(kgranges, "GRangesList"))
  NAMES <- do.call(c, lapply(kgranges, names))

  # Baseline column names: header line of the first baseline file minus its
  # first five fields.
  basefiles <- scan(baselinefile[[1]], "c", nlines = 1, sep = "\t")[-(1:5)]
  # Remember the first baseline file's directory now; baselinefile may be
  # trimmed below when resuming.
  basedir <- dirname(baselinefile[[1]])

  if (!file.exists(fn) || startover) {
    if (startover) unlink(fn)
    h5createFile(fn)
    flen <- sum(vapply(kgranges, length, integer(1)))
    ## have to add one because we create the union column
    fwid <- length(funcgranges) + length(gtexgranges) + length(basefiles) + 1
    cat("Creating an HDF5 file, dimension", flen, "x", fwid, "\n")
    # Cap the chunk height at the number of rows so a small dataset does not
    # request a chunk larger than the dataset itself.
    h5createDataset(fn, "assay1", c(flen, fwid), storage.mode = "logical",
                    chunk = c(min(1000, flen), fwid))
  }

  # Elements already written in a previous run. seq_len() yields an empty
  # vector for startwith = 1, where 1:(startwith - 1) would yield c(1, 0).
  skipped <- if (startover) integer(0) else seq_len(startwith - 1)
  firstrow <- 1
  if (length(skipped) > 0) {
    firstrow <- sum(vapply(kgranges[skipped], length, integer(1))) + 1
    kgranges <- kgranges[-skipped]
    # Keep the baseline files aligned with the remaining kgranges elements.
    baselinefile <- baselinefile[-skipped]
  }

  for (i in seq_along(kgranges)) {
    chrom <- unique(as.character(seqnames(kgranges[[i]])))
    # Column blocks, left to right: funcgranges, gtexgranges, baseline.
    mat <- cbind(
      populateMatrix(kgranges[[i]], funcgranges),
      populateMatrix(kgranges[[i]], gtexgranges, TRUE),
      addInBaseline(kgranges[[i]], baselinefile[[i]])
    )
    lastrow <- firstrow + nrow(mat) - 1
    cat(paste("Running", chrom),
        paste("Starting at", firstrow),
        paste("Ending at", lastrow), sep = "\n")
    cat("\n")
    h5write(mat, fn, "assay1", createnewfile = FALSE,
            index = list(firstrow:lastrow, seq_len(ncol(mat))))
    cat(paste("Finished writing", chrom, "to disk"), "\n")
    firstrow <- firstrow + nrow(mat)
    # Release the chunk and any open HDF5 handles before the next pass.
    rm(mat)
    gc()
    H5close()
  }

  coldat <- DataFrame(
    Path = c(dirname(fnames), "Internally_generated",
             rep(basedir, length(basefiles))),
    Source = rep(c("GTEx", "LDSC"), c(length(fnames) + 1, length(basefiles))),
    Filename = c(basename(fnames), "Internally_generated", basefiles)
  )
  rownames(coldat) <- coldat$Filename

  # Point at the file that was actually written, not a hard-coded default path.
  out <- SummarizedExperiment(assays = HDF5Array(fn, "assay1"),
                              colData = coldat,
                              rowRanges = rr)
  names(out) <- NAMES

  ## Save it and output
  # NOTE(review): .shorten_h5_paths is an unexported internal and may not exist
  # in current SummarizedExperiment releases; confirm, or switch to the
  # exported HDF5Array save helpers.
  out@assays <- SummarizedExperiment:::.shorten_h5_paths(out@assays)
  ## we don't return anything - this function is just to generate, and the
  ## file can then be opened using loadHDF5SummarizedExperiment
  saveRDS(out, file.path(dir, "se.rds"))
}
Thanks Mike, that was super helpful!