-
Notifications
You must be signed in to change notification settings - Fork 22
Expand file tree
/
Copy pathffCompress.Rd
More file actions
150 lines (140 loc) · 4.8 KB
/
ffCompress.Rd
File metadata and controls
150 lines (140 loc) · 4.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
\name{ffCompress}
\alias{ffCompress}
\alias{as.data.frame.ffdflabel}
\alias{[.ffdflabel}
\title{Transform Data Frame Into a Compact \code{ff} Object}
\description{
The \code{ff} package implements a wide variety of atomic data types
down to 2 bits, allowing for compact storage of large datasets and
requiring memory usage in R for only those rows and columns of the
dataset that are needed at one time. It is best to create a compact
\code{ffdf} data frame object while initially reading the external
data file, using for example an input \code{.csv} file and specifying
the \code{ff} \code{vmode}s for all the columns. If you can get
enough memory to read the entire dataset into an R \code{data.frame}
you can use \code{ffCompress} after the fact to analyze the data frame
and use the most compact data representation possible. This entails
using single precision for floating point numbers (which can be easily
overridden to use R's usual double precision) and a variety of integer
types depending on the number of bits used by the maximum absolute value
of the variable, whether or not \code{NA}s exist in the data, and
whether or not negative values are possible.
Since \code{ff} does not allow variable labels and units, any such
attributes are stripped off of variables and stored as attributes on the
entire \code{ffdf} object. The \code{as.data.frame} and subscripting
methods retrieve these attributes and restore them to individual
variables when needed.
}
\usage{
ffCompress(obj, float=c('single', 'double'), print=FALSE)
\method{as.data.frame}{ffdflabel}(x, \dots)
}
\arguments{
\item{obj}{a data frame}
\item{float}{representation to use for floating point vectors. The
default is single precision (4 bytes, 7 significant digits).
Specify \code{float='double'} to use double precision (8 bytes, 15
significant digits)}
\item{print}{set to \code{TRUE} to print progress output; the value is
also passed as the \code{VERBOSE} argument to \code{ff} functions}
\item{x}{an \code{ffdf} object}
\item{\dots}{ignored}
}
\value{
an \code{ffdf} object for \code{ffCompress}, a \code{data.frame} for
\code{as.data.frame}, and either one of these for subscripting. If
subscripting results in a single variable and \code{drop=FALSE} is not
specified, the result is of \code{ff} type.
}
\author{Frank Harrell, Vanderbilt University}
\seealso{\code{\link[Hmisc]{cleanup.import}}}
\examples{
\dontrun{
require(ff)
require(survival)
n <- 1e6
d <- data.frame(x=rnorm(n), y=sample(0:1, n, TRUE),
i=as.Date('2013-01-02'), S=Surv(runif(n)),
z=factor(sample(1:3, n, TRUE), 1:3,
c('elephant','giraffe','dog')))
## Cannot have labels for variables; ff will reject as non-atomic vectors
storage.mode(d$y)
object.size(d)
n * (8 + 4 + 4 + 4)
f <- as.ffdf(d, vmode=c('single', 'quad', 'integer', 'single', 'quad'))
vmode(f)
n * (4 + 0.25 + 4 + 0.25)
object.size(as.data.frame(f))
f[1:10,]
hist(d[,'x'] - f[,'x'], nclass=100)
table(d[,'z'], f[,'z'])
system.time(subset(f, z == 'dog'))
system.time({i <- ffwhich(f, z == 'dog'); f[i,]})
table(subset(f, z == 'dog')[,'z'])
class(subset(f, z == 'dog'))
ffsave(f, file='/tmp/f') # creates /tmp/f.ffData /tmp/f.RData
## To load: ffload('/tmp/f')
d <- upData(d, labels=c(y='Y'), units=c(z='units z'))
f <- ffCompress(d)
vmode(f)
load('ras.rda') # dataset is not available
r <- ffCompress(ras)
vmode(r)
attr(r, 'label')
attr(r, 'units')
all.equal(ras, as.data.frame(r))
dr <- as.data.frame(r)
g <- function(x) names(attributes(x))
nam <- names(dr)
for(i in 1 : ncol(dr)) {
a <- ras[[i]]
b <- dr[[i]]
cat(nam[i], '\n')
cat(g(a), '\n', g(b), '\n')
cat(max(w <- abs(unclass(a) - unclass(b)), na.rm=TRUE), '\n')
if(nam[i] == 'ldl') {
j <- which.max(abs(w))
cat(a[j], b[j], '\n')
}
}
dr <- as.data.frame(r)
xless(contents(dr))
xless(contents(r[1:10,]))
xless(contents(r[,1:10]))
table(r[, 'gender'])
## subset invokes [] so uses method from ffdflabel
m <- subset(r, gender == 'Male')
class(m)
dim(m)
attr(m, 'label')
attributes(m[,'age'])
df <- as.data.frame(m)
class(df$age)
label(df$age)
## But if we subset again, things are not OK
k <- subset(m, age < 3)
class(k)
contents(k[, 'age', drop=FALSE])
invisible(ffsave(r, file='/tmp/r'))
## w <- read.csv.ffdf(file='/tmp/data.csv', first.rows=10000)
## table(vmode(w))
## From ff manual: vmode definitions
# boolean 1 bit logical without NA
# logical 2 bit logical with NA
# quad 2 bit unsigned integer without NA
# nibble 4 bit unsigned integer without NA
# byte 8 bit signed integer with NA
# ubyte 8 bit unsigned integer without NA
# short 16 bit signed integer with NA
# ushort 16 bit unsigned integer without NA
# integer 32 bit signed integer with NA
# single 32 bit float
# double 64 bit float
# complex 2x64 bit float
# raw 8 bit unsigned char
# character character
}
}
\keyword{attribute}
\keyword{misc}
\keyword{utilities}