Search
Question: segfault when using data.table package in conjunction with foreach
0
gravatar for Matthew Keller
5.7 years ago by
Matthew Keller (100) wrote:
Hi all, I'm trying to use the package data.table within a foreach loop. I'm grabbing 500M rows of data at a time from two different files and then doing an aggregate/tapply-like function in data.table after that. I had planned on doing a foreach loop 39 times at once for the 39 files I have, but obviously that won't work until I figure out why the segfault is occurring. The sessionInfo, code, and error are pasted below. If you have any ideas, would love to hear them. (I have no control over the version of R - 2.13.0 - being used). Best Matt SESSION INFO: > sessionInfo() R version 2.13.0 (2011-04-13) Platform: x86_64-unknown-linux-gnu (64-bit) locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=C [6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] data.table_1.7.10 doMC_1.2.2 multicore_0.1-5 foreach_1.3.2 codetools_0.2-8 iterators_1.0.3 MY CODE: computeAllPairSums <- function(filename, nbindiv,nrows.to.read) { con <- file(filename, open="r") on.exit(close(con)) ans <- matrix(numeric(nbindiv * nbindiv), nrow=nbindiv) chunk <- 0L while (TRUE) { #read.table faster than scan df0 <- read.table(con,col.names=c("ID1", "ID2", "ignored", "sharing"), colClasses=c("integer", "integer", "NULL", "numeric"),nrows=nrows.to.read,comment.char="") DT <- data.table(df0) setkey(DT,ID1,ID2) ss <- DT[,sum(sharing),by="ID1,ID2"] if (nrow(df0) == 0L) break chunk <- chunk + 1L cat("Processing chunk", chunk, "... 
") idd <- as.matrix(subset(ss,select=1:2)) newvec <- as.vector(as.matrix(subset(ss,select=3))) ans[idd] <- ans[idd] + newvec cat("OK\n") } ans } require(foreach) require(doMC) registerDoMC(cores=2) num <- 8891 nr <- 500000000L #500 million rows at a time MMM <- foreach(IT = 1:2) %dopar% { require(data.table) if (IT==1){ x <- system.time({computeAllPairSums( paste(GERMLINE,"bc.chr22.q.20.file",sep=''),num,nr)}) } #Run it on regular file PID 6489, 24 gb if (IT==2){ z <- system.time({computeAllPairSums.gz( paste(GERMLINE,"bc.chr22.q.20.gz",sep=''),num,nr)}) } #Run it on gz file PID 6490, 24 gb } MY R OUTPUT/ERROR: MMM <- foreach(IT = 1:2) %dopar% { + require(data.table) + if (IT==1){ x <- system.time({computeAllPairSums( paste(GERMLINE,"bc.chr22.q.20.file",sep=''),num,nr)}) } #Run it on regular file PID 6053, 5.9 gb + if (IT==2){ z <- system.time({computeAllPairSums.gz( paste(GERMLINE,"bc.chr22.q.20.gz",sep=''),num,nr)}) } #Run it on gz file PID 6054, 4 gb + } Loading required package: data.table Loading required package: data.table data.table 1.7.10 For help type: help("data.table") data.table 1.7.10 For help type: help("data.table") *** caught segfault *** address 0x2ae93df90000, cause 'memory not mapped' Traceback: 1: .Call("dogroups", x, xcols, o__, f__, len__, jsub, SDenv, testj, byretn, byval, i, as.integer(icols), i[1, ivars, with = FALSE], if (length(ivars)) paste("i.", ivars, sep = ""), is.na(nomatch), verbose, PACKAGE = "data.table") 2: `[.data.table`(DT, , sum(sharing), by = "ID1,ID2") 3: DT[, sum(sharing), by = "ID1,ID2"] 4: computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep = ""), num, nr) 5: system.time({ computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep = ""), num, nr)}) 6: eval(expr, envir, enclos) 7: eval(c.expr, envir = args, enclos = envir) 8: doTryCatch(return(expr), name, parentenv, handler) 9: tryCatchOne(expr, names, parentenv, handlers[[1L]]) 10: tryCatchList(expr, classes, parentenv, handlers) 11: tryCatch(eval(c.expr, 
envir = args, enclos = envir), error = function(e) e) 12: FUN(X[[1L]], ...) 13: lapply(S, FUN, ...) 14: doTryCatch(return(expr), name, parentenv, handler) 15: tryCatchOne(expr, names, parentenv, handlers[[1L]]) 16: tryCatchList(expr, classes, parentenv, handlers) 17: tryCatch(expr, error = function(e) { call <- conditionCall(e) if (!is.null(call)) { if (identical(call[[1L]], quote(doTryCatch))) call <- sys.call(-4L) dcall <- deparse(call)[1L] prefix <- paste("Error in", dcall, ": ") LONG <- 75L msg <- conditionMessage(e) sm <- strsplit(msg, "\n")[[1L]] w <- 14L + nchar(dcall, type = "w") + nchar(sm[1L], type = "w") if is.na(w)) w <- 14L + nchar(dcall, type = "b") + nchar(sm[1L], type = "b") if (w > LONG) prefix <- paste(prefix, "\n ", sep = "") } else prefix <- "Error : " msg <- paste(prefix, conditionMessage(e), "\n", sep = "") .Internal(seterrmessage(msg[1L])) if (!silent && identical(getOption("show.error.messages"), TRUE)) { cat(msg, file = stderr()) .Internal(printDeferredWarnings()) } invisible(structure(msg, class = "try-error"))}) 18: try(lapply(S, FUN, ...), silent = TRUE) 19: sendMaster(try(lapply(S, FUN, ...), silent = TRUE)) 20: FUN(1:2[[1L]], ...) 21: lapply(1:cores, inner.do) 22: mclapply(argsList, FUN, mc.preschedule = preschedule, mc.set.seed = set.seed, mc.silent = silent, mc.cores = cores) 23: e$fun(obj, substitute(ex), parent.frame(), e$data) 24: foreach(IT = 1:2) %dopar% { require(data.table) if (IT == 1) { x <- system.time({ computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep = ""), num, nr) }) } if (IT == 2) { z <- system.time({ computeAllPairSums.gz(paste(GERMLINE, "bc.chr22.q.20.gz", sep = ""), num, nr) }) }} Possible actions: 1: abort (with core dump, if enabled) 2: normal R exit 3: exit R without saving workspace 4: exit R saving workspace -- Matthew C Keller Asst. Professor of Psychology University of Colorado at Boulder www.matthewckeller.com
ADD COMMENT • link written 5.7 years ago by Matthew Keller (100)
Please log in to add an answer.

Help
Access

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
Powered by Biostar version 2.2.0
Traffic: 305 users visited in the last hour