This function can compute the mode for arbitrary iterables, including vectors of floats:
|
# Return the most frequently occurring element of an arbitrary iterable.
# Ties are broken in favor of the element reaching the highest count first.
function mode(a)
    isempty(a) && throw(ArgumentError("mode is not defined for empty collections"))
    counts = Dict{eltype(a),Int}()
    # Seed the running best with the first element.
    best, state = iterate(a)
    counts[best] = 1
    bestcount = 1
    # Build the frequency table and track the mode in a single pass.
    next = iterate(a, state)
    while next !== nothing
        v, state = next
        c = get(counts, v, 0) + 1
        counts[v] = c
        # A newly seen element has c == 1 and can never beat bestcount (>= 1),
        # so this check only promotes repeated elements.
        if c > bestcount
            bestcount = c
            best = v
        end
        next = iterate(a, state)
    end
    return best
end
Indeed, it's called to compute the mode of a vector of floats:
julia> @which StatsBase.mode([1.,3.,3.141])
mode(a)
@ StatsBase ~/.julia/packages/StatsBase/s2YJR/src/scalarstats.jl:111
However, it has very high bias and variance compared to the half-sample mode (HSM, see [1]), for example. This can be easily verified with simulations:
Row │ nobs dist method bias variance MAE
│ Int64 Distribu… String Float64 Float64 Float64
─────┼───────────────────────────────────────────────────────────────────────────────────────────
1 │ 1000 Normal{Float64}(μ=0.0, σ=1.0) StatsBase -0.0105188 1.03791 0.818895
2 │ 1000 Normal{Float64}(μ=0.0, σ=1.0) HSM -0.00256086 0.0550354 0.189431
3 │ 1000000 Normal{Float64}(μ=0.0, σ=1.0) StatsBase -0.0242424 1.03698 0.820519
4 │ 1000000 Normal{Float64}(μ=0.0, σ=1.0) HSM -0.00500615 0.00342947 0.0479765
5 │ 1000 Normal{Float64}(μ=-1.0, σ=1.0) StatsBase 0.00682327 0.956165 0.788644
6 │ 1000 Normal{Float64}(μ=-1.0, σ=1.0) HSM -0.00291935 0.0560346 0.189825
7 │ 1000000 Normal{Float64}(μ=-1.0, σ=1.0) StatsBase 0.0167116 0.955203 0.778444
8 │ 1000000 Normal{Float64}(μ=-1.0, σ=1.0) HSM -0.00111594 0.00361161 0.0485056
9 │ 1000 TDist{Float64}(ν=0.1) StatsBase 1.68401e33 2.77662e69 1.68401e33
10 │ 1000 TDist{Float64}(ν=0.1) HSM 0.00730426 0.0190662 0.110146
11 │ 1000000 TDist{Float64}(ν=0.1) StatsBase -1.96736e28 3.87052e59 1.96738e28
12 │ 1000000 TDist{Float64}(ν=0.1) HSM 0.00123069 0.00077329 0.0222707
13 │ 1000 TDist{Float64}(ν=1.5) StatsBase -0.0189519 10.7845 1.62119
14 │ 1000 TDist{Float64}(ν=1.5) HSM -0.000349072 0.0394342 0.158469
15 │ 1000000 TDist{Float64}(ν=1.5) StatsBase 0.0527691 27.5577 1.96788
16 │ 1000000 TDist{Float64}(ν=1.5) HSM 0.00175729 0.00257369 0.0406603
17 │ 1000 Beta{Float64}(α=9.2, β=5.6) StatsBase -0.0223237 0.0146286 0.0988868
18 │ 1000 Beta{Float64}(α=9.2, β=5.6) HSM -0.0015705 0.00102678 0.0260994
19 │ 1000000 Beta{Float64}(α=9.2, β=5.6) StatsBase -0.0175676 0.0144346 0.0976854
20 │ 1000000 Beta{Float64}(α=9.2, β=5.6) HSM -0.000445367 6.02226e-5 0.00628862
- In all cases, the bias of
StatsBase.mode is at least 4 times higher than that of HSM. It's arbitrarily high when the true data-generating process (DGP) has no mean, like for the Student distribution with degrees of freedom less than 1. Note that the mode of any Student distribution is 0 (or equal to the location parameter), independent of the number of degrees of freedom.
- The variance of the
StatsBase.mode estimator can be arbitrarily high as well. In the case of TDist(0.1), it's on the order of $10^{59}$. For TDist(1.5) it's around 30. Meanwhile, the variance of HSM is less than 0.005 in both cases.
- The MAE for
StatsBase.mode is at least 15 times (!) higher than for HSM and can be made arbitrarily high ($10^{28}$ for TDist(0.1) compared to 0.02 of HSM).
Is StatsBase.mode actually meant for computing the mode of arbitrary iterables, including samples from a continuous distribution? Probably not, because it simply returns the most common element in the collection, but samples from continuous distributions are usually unique. Perhaps a separate method is needed to estimate the mode from a sample of floats, like the HSM. If so, I could contribute a PR.
References
- David R. Bickel, Rudolf Frühwirth, "On a fast, robust estimator of the mode: Comparisons to other robust estimators with applications", Computational Statistics & Data Analysis, Volume 50, Issue 12, 2006, Pages 3500-3530, ISSN 0167-9473, https://doi.org/10.1016/j.csda.2005.07.011.
Code to generate the table
import Pkg
Pkg.activate(temp=true)
Pkg.add(name="StatsBase", version="0.34.5")
Pkg.add(["Distributions", "ThreadsX", "DataFrames"])
Pkg.status()
import ThreadsX, StatsBase
import DataFrames: DataFrame
using Distributions
"""
    mode_HSM(x::AbstractVector{T}; sorted::Bool=false) where T<:Real

Estimate the mode of a sample `x` with the half-sample mode (HSM):
repeatedly keep the narrowest window containing half of the (sorted)
points until at most three remain, then return their midpoint.

Pass `sorted=true` when `x` is already sorted in ascending order to skip
the sorting pass (which allocates a copy of `x`); the main loop itself
does not allocate.

Throws `ArgumentError` for an empty input.
"""
function mode_HSM(x::AbstractVector{T}; sorted::Bool=false)::T where T<:Real
    # Guard up front: without this, an empty input only fails later with an
    # opaque BoundsError in the final branch.
    isempty(x) && throw(ArgumentError("mode_HSM is not defined for empty collections"))
    if !sorted
        x = sort(x)  # allocates a vector of `length(x)` elements
    end
    start_idx = 1
    end_idx = length(x)
    n = end_idx - start_idx + 1
    while n > 3
        # Half-sample size; `cld` is the exact integer form of
        # `ceil(Int, n / 2)` (no Float64 round-trip).
        N = cld(n, 2)
        # `typemax(T)` equals `Inf` for float types and, unlike `T(Inf)`,
        # also works for integer element types (no InexactError).
        min_width = typemax(T)
        min_start = start_idx
        # Slide a window of N consecutive points; keep the narrowest one.
        for i in start_idx:(end_idx - N + 1)
            width = x[i + N - 1] - x[i]
            if width < min_width
                min_width = width
                min_start = i
            end
        end
        start_idx = min_start
        end_idx = min_start + N - 1
        n = end_idx - start_idx + 1
    end
    # At most three points remain; return the estimated mode.
    if n == 1
        x[start_idx]
    elseif n == 2
        (x[start_idx] + x[end_idx]) / 2
    else
        # Three points: midpoint of the closer pair, or the middle point
        # when both gaps are equal.
        d1 = x[start_idx + 1] - x[start_idx]
        d2 = x[start_idx + 2] - x[start_idx + 1]
        if d1 < d2
            (x[start_idx] + x[start_idx + 1]) / 2
        elseif d2 < d1
            (x[start_idx + 1] + x[start_idx + 2]) / 2
        else
            x[start_idx + 1]
        end
    end
end
# Monte-Carlo comparison of StatsBase.mode vs. the half-sample mode:
# draw `nsamples` datasets of `nobs` observations from `dist`, estimate the
# mode of each with both methods, and summarize bias / variance / MAE.
function compare(nobs::Integer, dist::ContinuousUnivariateDistribution, nsamples::Integer=1000)
    # Reference value: the true mode of the data-generating distribution.
    true_mode = Distributions.mode(dist)
    # One (StatsBase, HSM) estimate pair per replication, threaded.
    estimates = ThreadsX.map(1:nsamples) do _
        samples = rand(dist, nobs)
        (StatsBase.mode(samples), mode_HSM(samples))
    end
    # Build one summary row for a given estimator's values.
    summarize(label, vals) = (;
        nobs, dist,
        method=label,
        bias=mean(vals) - true_mode,
        variance=var(vals),
        MAE=mean(abs(v - true_mode) for v in vals),
    )
    return DataFrame([
        summarize("StatsBase", first.(estimates)),
        summarize("HSM", last.(estimates)),
    ])
end
# Run one benchmark configuration and append its rows to the running table.
# `df_full === nothing` starts a fresh table; `quiet=true` suppresses the
# per-configuration display.
function compare_append(df_full::Union{Nothing, DataFrame}, nobs, dist, nsamples=1000; quiet::Bool=false)
    df = compare(nobs, dist, nsamples)
    quiet || display(df)
    # `== nothing` is non-idiomatic (and unreliable if `==` is overloaded);
    # `isnothing` is the canonical check.
    return isnothing(df_full) ? df : vcat(df_full, df)
end
df_full = let
    @info "Starting benchmark..."
    # (nobs, distribution) configurations to benchmark.
    # TDist: mean/variance may not exist, but the mode is always zero.
    # Beta: the mode only exists when α, β > 1.
    configs = [
        (10^3, Normal(0, 1)),
        (10^6, Normal(0, 1)),
        (10^3, Normal(-1, 1)),
        (10^6, Normal(-1, 1)),
        (10^3, TDist(0.1)),  # mean doesn't exist
        (10^6, TDist(0.1)),
        (10^3, TDist(1.5)),  # variance doesn't exist
        (10^6, TDist(1.5)),
        (10^3, Beta(9.2, 5.6)),
        (10^6, Beta(9.2, 5.6)),
    ]
    # Accumulate all result rows into one DataFrame.
    acc = nothing
    for (nobs, dist) in configs
        acc = compare_append(acc, nobs, dist)
    end
    println("\n\n")
    display(acc)
    acc
end
This function can compute the mode for arbitrary iterables, including vectors of floats:
StatsBase.jl/src/scalarstats.jl
Lines 110 to 135 in dfe8945
Indeed, it's called to compute the mode of a vector of floats:
However, it has very high bias and variance compared to the half-sample mode (HSM, see [1]), for example. This can be easily verified with simulations:
- In all cases, the bias of `StatsBase.mode` is at least 4 times higher than that of HSM. It's arbitrarily high when the true data-generating process (DGP) has no mean, like for the Student distribution with degrees of freedom less than 1. Note that the mode of any Student distribution is 0 (or equal to the location parameter), independent of the number of degrees of freedom.
- The variance of the `StatsBase.mode` estimator can be arbitrarily high as well. In the case of `TDist(0.1)`, it's on the order of $10^{59}$. For `TDist(1.5)` it's around 30. Meanwhile, the variance of HSM is less than 0.005 in both cases.
- The MAE for `StatsBase.mode` is at least 15 times (!) higher than for HSM and can be made arbitrarily high ($10^{28}$ for `TDist(0.1)` compared to 0.02 of HSM).

Is `StatsBase.mode` actually meant for computing the mode of arbitrary iterables, including samples from a continuous distribution? Probably not, because it simply returns the most common element in the collection, but samples from continuous distributions are usually unique. Perhaps a separate method is needed to estimate the mode from a sample of floats, like the HSM. If so, I could contribute a PR.

References
Code to generate the table