diff --git a/Project.toml b/Project.toml
index b95482b..695cd26 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,20 +1,14 @@
 name = "ParallelUtilities"
 uuid = "fad6cfc8-4f83-11e9-06cc-151124046ad0"
 authors = ["Jishnu Bhattacharya"]
-version = "0.7.7"
+version = "0.8.0"
 
 [deps]
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
-OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
-ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
-Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 
 [compat]
 DataStructures = "0.17, 0.18"
-OffsetArrays = "1"
-ProgressMeter = "1.2"
-Reexport = "0.2, 1.0"
 julia = "1.2"
 
 [extras]
diff --git a/README.md b/README.md
index 1bcf32a..ef7d08f 100644
--- a/README.md
+++ b/README.md
@@ -7,354 +7,36 @@
 
 Parallel mapreduce and other helpful functions for HPC, meant primarily for embarassingly parallel operations that often require one to split up a list of tasks into subsections that may be processed on individual cores.
 
+Note: This package deals with distributed (multi-core) parallelism, and at this moment it has not been tested alongside multi-threading.
+
 # Installation
 
 Install the package using
 
 ```julia
 pkg> add ParallelUtilities
-julia> using ParallelUtilities 
-```
-# Quick start
-
-```julia
-julia> addprocs(2)
-2-element Array{Int64,1}:
- 2
- 3
-
-julia> @everywhere using ParallelUtilities
-
-julia> pmapreduce(x -> ones(2) .* myid(), x -> hcat(x...), 1:nworkers())
-2×2 Array{Float64,2}:
- 2.0  3.0
- 2.0  3.0
-
-julia> pmapreduce_commutative(x -> ones(2) .* myid(), sum, 1:nworkers())
-2-element Array{Float64,1}:
- 5.0
- 5.0
-
-julia> pmapsum(x -> ones(2) .* myid(), 1:nworkers())
-2-element Array{Float64,1}:
- 5.0
- 5.0
-```
-
-# Performance
-
-The `pmapreduce`-related functions are expected to be more performant than `@distributed` for loops. As an example, running the following on a Slurm cluster using 2 nodes with 28 cores on each leads to
-
-```julia
-julia> @time @distributed (+) for i=1:nworkers()
-           ones(10_000, 1_000)
-       end;
- 22.355047 seconds (7.05 M allocations: 8.451 GiB, 6.73% gc time)
-
-julia> @time pmapsum(x -> ones(10_000, 1_000), 1:nworkers());
-  2.672838 seconds (52.83 k allocations: 78.295 MiB, 0.53% gc time)
-```
-
-The difference becomes more apparent as larger data needs to be communicated across workers. This is because `ParallelUtilities.pmapreduce*` perform local reductions on each node before communicating across nodes.
-
-# Usage
-
-The package splits up a collection of ranges into subparts of roughly equal length, so that all the cores are approximately equally loaded. This is best understood using an example: let's say that we have a function `f` that is defined as   
-
-```julia
-julia> @everywhere begin 
-       f(x,y,z) = x+y+z
-       end
-```
-
-where each parameter takes up values in a range, and we would like to sample the entire parameter space. As an example, we choose the ranges to be 
-
-```julia
-julia> xrange, yrange, zrange = 1:3, 2:4, 3:6 # ranges should be strictly increasing
-```
-
-There are a total of 36 possible `(x,y,z)` combinations possible given these ranges. Let's say that we would like to split the evaluation of the function over 10 processors. We describe the simple way to evaluate this and then explain how this is achieved.
-
-The set of parameters may be split up using the function `ProductSplit`. In this example each of the 10 processors receive a chunk as listed below
-
-```julia
-julia> [collect(ProductSplit((xrange,yrange,zrange),10,i)) for i=1:10]
-10-element Array{Array{Tuple{Int64,Int64,Int64},1},1}:
- [(1, 2, 3), (2, 2, 3), (3, 2, 3), (1, 3, 3)]
- [(2, 3, 3), (3, 3, 3), (1, 4, 3), (2, 4, 3)]
- [(3, 4, 3), (1, 2, 4), (2, 2, 4), (3, 2, 4)]
- [(1, 3, 4), (2, 3, 4), (3, 3, 4), (1, 4, 4)]
- [(2, 4, 4), (3, 4, 4), (1, 2, 5), (2, 2, 5)]
- [(3, 2, 5), (1, 3, 5), (2, 3, 5), (3, 3, 5)]
- [(1, 4, 5), (2, 4, 5), (3, 4, 5)]           
- [(1, 2, 6), (2, 2, 6), (3, 2, 6)]           
- [(1, 3, 6), (2, 3, 6), (3, 3, 6)]           
- [(1, 4, 6), (2, 4, 6), (3, 4, 6)] 
-```
-
-The first six processors receive 4 tuples of parameters each and the final four receive 3 each. This is the splitting used by the various functions described next.
-
-## pmap-related functions
-
-The package provides versions of `pmap` with an optional reduction. These differ from the one provided by `Distributed` in a few key aspects: firstly, the iterator product of the argument is what is passed to the function and not the arguments by elementwise, so the i-th task will be `Iterators.product(args...)[i]` and not `[x[i] for x in args]`. Specifically the second set of parameters in the example above will be `(2,2,3)` and not `(2,3,4)`.
-
-Secondly, the iterator is passed to the function in batches and not elementwise, and it is left to the function to iterate over the collection. Thirdly, the tasks are passed on to processors sorted by rank, so the first task is passed to the first processor and the last to the last active worker. The tasks are also approximately evenly distributed across processors. The exported function `pmapbatch_elementwise` passes the elements to the function one-by-one as splatted tuples. This produces the same result as `pmap` for a single range as the argument.
-
-### pmapbatch and pmapbatch_elementwise
-
-As an example we demonstrate how to evaluate the function `f` for the ranges of parameters listed above:
-
-```julia
-julia> p = pmapbatch_elementwise(f, (xrange,yrange,zrange));
-
-julia> Tuple(p)
-(6, 7, 8, 7, 8, 9, 8, 9, 10, 7, 8, 9, 8, 9, 10, 9, 10, 11, 8, 9, 10, 9, 10, 11, 10, 11, 12, 9, 10, 11, 10, 11, 12, 11, 12, 13)
-```
-
-There is also a function `pmapbatch` that deals with batches of parameters that are passed to each processor, and `pmap_elementwise` calls this function under the hood to process the parameters one by one. We may use this directly as well if we need the entire batch for some reason (eg. reading values off a disk, which needs to be done once for the entire set and not for every parameter). As an example we demonstrate how to obtain the same result as above using `pmapbatch`:
-
-```julia
-julia> p = pmapbatch(x->[f(i...) for i in x], (xrange,yrange,zrange));
-
-julia> Tuple(p)
-(6, 7, 8, 7, 8, 9, 8, 9, 10, 7, 8, 9, 8, 9, 10, 9, 10, 11, 8, 9, 10, 9, 10, 11, 10, 11, 12, 9, 10, 11, 10, 11, 12, 11, 12, 13)
-```
-
-### pmapsum and pmapreduce
-
-Often a parallel execution is followed by a reduction (eg. a sum over the results). A reduction may be commutative (in which case the order of results do not matter), or non-commutative (in which the order does matter). There are two functions that are exported that carry out these tasks: `pmapreduce_commutative` and `pmapreduce`, where the former does not preserve ordering and the latter does. For convenience, the package also provides the function `pmapsum` that chooses `sum` as the reduction operator. The map-reduce operation is similar in many ways to the distributed `for` loop provided by julia, but the main difference is that the reduction operation is not binary for the functions in this package (eg. we need `sum` and not `(+)`to add the results). There is also the difference as above that the function gets the parameters in batches, with functions having the suffix `_elementwise` taking on parameters individually as splatted `Tuple`s. The function `pmapreduce` does not take on parameters elementwise at this point, although this might be implemented in the future.
-
-As an example, to sum up a list of numbers in parallel we may call
-```julia
-julia> pmapsum_elementwise(identity, 1:1000)
-500500
+julia> using ParallelUtilities
 ```
 
-Here the mapped function is taken to by `identity` which just returns its argument. To sum the squares of the numbers in a list we may use 
-
-```julia
-julia> pmapsum_elementwise(x -> x^2, 1:1000)
-333833500
-```
-
-We may choose an arbitrary reduction operator in the function `pmapreduce` and `pmapreduce_commutative`, and the elementwise function `pmapreduce_commutative_elementwise`. The reductions are carried out as a binary tree across all workers.
-
-```julia
-# Compute 1^2 * 2^2 * 3^2 in parallel
-julia> pmapreduce_commutative_elementwise(x -> x^2, prod, 1:3)
-36
-```
-
-The function `pmapreduce` sorts the results obtained from each processor, so it is useful for concatenations.
-
-```julia
-julia> workers()
-2-element Array{Int64,1}:
- 2
- 3
-
-# The signature is pmapreduce(fmap, freduce, range_or_tuple_of_ranges)
-julia> pmapreduce(x -> ones(2).*myid(), x -> hcat(x...), 1:nworkers())
-2×2 Array{Float64,2}:
- 2.0  3.0
- 2.0  3.0
-```
-
-The functions `pmapreduce` produces the same result as `pmapreduce_commutative` if the reduction operator is commutative (ie. the order of results received from the children workers does not matter).
-
-The function `pmapsum` sets the reduction function to `sum`.
-
-```julia
-julia> sum(workers())
-5
-
-# We compute ones(2).*sum(workers()) in parallel
-julia> pmapsum(x -> ones(2).*myid(), 1:nworkers())
-2-element Array{Float64,1}:
- 5.0
- 5.0
-```
-
-It is possible to specify the return types of the map and reduce operations in these functions. To specify the return types use the following variants:
-
-```julia
-# Signature is pmapreduce(fmap, Tmap, freduce, Treduce, range_or_tuple_of_ranges)
-julia> pmapreduce(x -> ones(2).*myid(), Vector{Float64}, x -> hcat(x...), Matrix{Float64}, 1:nworkers())
-2×2 Array{Float64,2}:
- 2.0  3.0
- 2.0  3.0
-
-# Signature is pmapsum(fmap, Tmap, range_or_tuple_of_ranges)
-julia> pmapsum(x -> ones(2).*myid(), Vector{Float64}, 1:nworkers())
-2-element Array{Float64,1}:
- 5.0
- 5.0
-```
-
-Specifying the types would lead to a type coercion if possible, or an error if a conversion is not possible. This might help in asserting the correctness of the result obtained. For example:
-
-```julia
-# The result is converted from Vector{Float64} to Vector{Int}. 
-# Conversion works as the numbers are integers
-julia> pmapsum(x -> ones(2).*myid(), Vector{Int}, 1:nworkers())
-2-element Array{Int64,1}:
- 5
- 5
-
-# Conversion fails here as the numbers aren't integers
-julia> pmapsum(x -> rand(2), Vector{Int}, 1:nworkers())
-ERROR: On worker 2:
-InexactError: Int64(0.7742577217010362)
-```
-
-### Progress bar
-
-The progress of the map-reduce operation might be tracked by setting the keyword argument `showprogress` to true. This might be useful in case certain workers have a heavier load than others.
-
-```julia
-# Running on 8 workers, artificially induce load using sleep
-julia> pmapreduce(x -> (sleep(myid()); myid()), x -> hcat(x...), 1:nworkers(), showprogress=true)
-Progress in pmapreduce : 100%|██████████████████████████████████████████████████| Time: 0:00:09
-1×8 Array{Int64,2}:
- 2  3  4  5  6  7  8  9
-
-julia> pmapreduce(x -> (sleep(myid()); myid()), x -> hcat(x...), 1:nworkers(), showprogress=true, progressdesc="Progress : ")
-Progress : 100%|████████████████████████████████████████████████████████████████| Time: 0:00:09
-1×8 Array{Int64,2}:
- 2  3  4  5  6  7  8  9
-```
-
-Note that this does not track the progress of the individual maps, it merely tracks how many are completed. The progress of the individual maps may be tracked by explicitly passing a `RemoteChannel` to the mapping function and pushing the progress status to it from the workers.
-
-### Why two mapreduce functions?
-
-The two separate functions `pmapreduce` and `pmapreduce_commutative` exist for historical reasons. They use different binary tree structures for reduction. The commutative one might be removed in the future in favour of `pmapreduce`.
-
-## ProductSplit
-
-In the above examples we have talked about the tasks being distributed approximately equally among the workers without going into details about the distribution, which is what we describe here. The package provides an iterator `ProductSplit` that lists that ranges of parameters that would be passed on to each core. This may equivalently be achieved using an
-
-```Iterators.Take{Iterators.Drop{Iterators.ProductIterator}}```
-
-with appropriately chosen parameters, and in many ways a `ProductSplit` behaves similarly. However a `ProductSplit` supports several extra features such as `O(1)` indexing, which eliminates the need to actually iterate over it in many scenarios.
-
-The signature of the constructor is 
-
-```julia 
-ProductSplit(tuple_of_ranges, number_of_processors, processor_rank)
-```
-
-where `processor_rank` takes up values in `1:number_of_processors`. Note that this is different from MPI where the rank starts from 0. For example, we check the tasks that are passed on to the processor number 4:
-
-```julia
-julia> ps = ProductSplit((xrange, yrange, zrange), 10, 4);
-
-julia> collect(ps)
-4-element Array{Tuple{Int64,Int64,Int64},1}:
- (1, 3, 4)
- (2, 3, 4)
- (3, 3, 4)
- (1, 4, 4)
-```
-
-where the object loops over values of `(x,y,z)`, and the values are sorted in reverse lexicographic order (the last index increases the slowest while the first index increases the fastest). The ranges roll over as expected. The tasks are evenly distributed with the remainders being split among the first few processors. In this example the first six processors receive 4 tasks each and the last four receive 3 each. We can see this by evaluating the length of the `ProductSplit` operator on each processor
-
-```julia
-julia> Tuple(length(ProductSplit((xrange,yrange,zrange), 10, i)) for i=1:10)
-(4, 4, 4, 4, 4, 4, 3, 3, 3, 3)
-```
-
-### Indexing
-
-The iterator supports fast indexing
-```julia
-julia> ps[3]
-(3, 3, 4)
-
-julia> @btime $ps[3]
-  9.493 ns (0 allocations: 0 bytes)
-(3, 3, 4)
-```
-
-This is useful if we have a large number of parameters to analyze on each processor.
-
-```julia
-julia> xrange_long,yrange_long,zrange_long = 1:3000,1:3000,1:3000
-(1:3000, 1:3000, 1:3000)
-
-julia> params_long = (xrange_long,yrange_long,zrange_long);
-
-julia> ps = ProductSplit(params_long, 10, 3)
-2700000000-element ProductSplit((1:3000, 1:3000, 1:3000), 10, 3)
-[(1, 1, 601), ... , (3000, 3000, 900)]
-
-# Evaluate length using random ranges to avoid compiler optimizations
-julia> @btime length(p) setup = (n = rand(3000:4000); p = ProductSplit((1:n,1:n,1:n), 200, 2));
-  2.674 ns (0 allocations: 0 bytes)
-
-julia> @btime $ps_long[1000000] # also fast, does not iterate
-  32.530 ns (0 allocations: 0 bytes)
-(1000, 334, 901)
-
-julia> @btime first($ps_long)
-  31.854 ns (0 allocations: 0 bytes)
-(1, 1, 901)
-
-julia> @btime last($ps_long)
-  31.603 ns (0 allocations: 0 bytes)
-(3000, 3000, 1200)
-```
+# Quick start
 
-We may evaluate whether or not a value exists in the list and its index in `O(1)` time.
+Just replace `mapreduce` by `pmapreduce` in your code and things should work the same.
 
 ```julia
-julia> val = (3,3,4)
-(3, 3, 4)
-
-julia> val in ps
-true
-
-julia> localindex(ps, val)
-3
-
-julia> val = (10,2,901);
+julia> @everywhere f(x) = (sleep(1); x^2); # some expensive calculation
 
-julia> @btime $val in $ps_long
-  50.183 ns (0 allocations: 0 bytes)
-true
+julia> nworkers()
+2
 
-julia> @btime localindex($ps_long, $val)
-  104.513 ns (0 allocations: 0 bytes)
-3010
-```
-
-Another useful function is `whichproc` that returns the rank of the processor a specific set of parameters will be on, given the total number of processors. This is also computed using a binary search.
-
-```julia
-julia> whichproc(params_long, val, 10)
-4
+julia> @time mapreduce(f, +, 1:10) # Serial
+ 10.021436 seconds (40 allocations: 1.250 KiB)
+385
 
-julia> @btime whichproc($params_long, $val, 10);
-  353.706 ns (0 allocations: 0 bytes)
+julia> @time pmapreduce(f, +, 1:10) # Parallel
+  5.137051 seconds (863 allocations: 39.531 KiB)
+385
 ```
 
-### Extrema
-
-We may compute the ranges of each variable on any processor in `O(1)` time. 
-
-```julia
-julia> extrema(ps, dim = 2) # extrema of the second parameter on this processor
-(3, 4)
-
-julia> Tuple(extrema(ps, dim = i) for i in 1:3)
-((1, 3), (3, 4), (4, 4))
-
-# Minimum and maximum work similarly
-
-julia> (minimum(ps, dim = 2), maximum(ps, dim = 2))
-(3, 4)
+# Usage
 
-julia> @btime extrema($ps_long, dim=2)
-  52.813 ns (0 allocations: 0 bytes)
-(1, 3000)
-```
+See [the documentation](https://jishnub.github.io/ParallelUtilities.jl/stable) for examples and the API.
diff --git a/docs/make.jl b/docs/make.jl
index 6239394..dbaccbc 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -14,7 +14,10 @@ makedocs(;
         assets=String[],
     ),
     pages=[
-        "API" => "index.md",
+        "ParallelUtilities" => "index.md",
+        "Mapreduce" => "pmapreduce.md",
+        "ClusterQueryUtils" => "clusterquery.md",
+        "Reference" => "api.md",
     ],
 )
 
diff --git a/docs/src/api.md b/docs/src/api.md
new file mode 100644
index 0000000..d5870da
--- /dev/null
+++ b/docs/src/api.md
@@ -0,0 +1,5 @@
+# ParallelUtilities.jl
+
+```@autodocs
+Modules = [ParallelUtilities]
+```
diff --git a/docs/src/clusterquery.md b/docs/src/clusterquery.md
new file mode 100644
index 0000000..21652c9
--- /dev/null
+++ b/docs/src/clusterquery.md
@@ -0,0 +1,22 @@
+```@meta
+DocTestSetup  = quote
+    using ParallelUtilities
+    using ParallelUtilities.ClusterQueryUtils
+end
+```
+
+# Cluster Query Utilities
+
+These are a collection of helper functions that are used in `ParallelUtilities`, but may be used independently as well to obtain information about the cluster on which codes are being run.
+
+To use these functions run
+
+```jldoctest cqu
+julia> using ParallelUtilities.ClusterQueryUtils
+```
+
+The functions defined in this module are:
+
+```@autodocs
+Modules = [ParallelUtilities.ClusterQueryUtils]
+```
diff --git a/docs/src/index.md b/docs/src/index.md
index 9c3a401..239a9f4 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,9 +1,108 @@
 ```@meta
-CurrentModule = ParallelUtilities
+DocTestSetup  = quote
+    using ParallelUtilities
+end
 ```
 
 # ParallelUtilities.jl
 
-```@autodocs
-Modules = [ParallelUtilities, ParallelUtilities.ClusterQueryUtils]
-```
\ No newline at end of file
+The `ParallelUtilities` module defines certain functions that are useful in a parallel `mapreduce` operation, with particular focus on HPC systems. The approach is similar to a `@distributed (op) for` loop, where the entire section of iterators is split evenly across workers and reduced locally, followed by a global reduction. The operation is not load-balanced at present, and does not support retry on error.
+
+# Performance
+
+The `pmapreduce`-related functions are expected to be more performant than `@distributed` for loops. As an example, running the following on a Slurm cluster using 2 nodes with 28 cores on each leads to
+
+```julia
+julia> using Distributed
+
+julia> using ParallelUtilities
+
+julia> @everywhere f(x) = ones(10_000, 1_000);
+
+julia> A = @time @distributed (+) for i=1:nworkers()
+                f(i)
+            end;
+ 22.637764 seconds (3.35 M allocations: 8.383 GiB, 16.50% gc time, 0.09% compilation time)
+
+julia> B = @time pmapreduce(f, +, 1:nworkers());
+  2.170926 seconds (20.47 k allocations: 77.117 MiB)
+
+julia> A == B
+true
+```
+
+The difference increases with the size of data as well as the number of workers. This is because the `pmapreduce*` functions defined in this package perform local reductions before communicating data across nodes. Note that in this case the same operation may be carried out elementwise to obtain better performance.
+
+```julia
+julia> @everywhere elsum(x,y) = x .+= y;
+
+julia> A = @time @distributed (elsum) for i=1:nworkers()
+               f(i)
+           end;
+ 20.537353 seconds (4.74 M allocations: 4.688 GiB, 2.56% gc time, 1.26% compilation time)
+
+julia> B = @time pmapreduce(f, elsum, 1:nworkers());
+  1.791662 seconds (20.50 k allocations: 77.134 MiB)
+```
+
+A similar evaluation on 560 cores (20 nodes) takes
+
+```julia
+julia> @time for i = 1:10; pmapreduce(f, +, 1:nworkers()); end
+145.963834 seconds (2.53 M allocations: 856.693 MiB, 0.12% gc time)
+
+julia> @time for i = 1:10; pmapreduce(f, elsum, 1:nworkers()); end
+133.810309 seconds (2.53 M allocations: 856.843 MiB, 0.13% gc time)
+```
+
+An example of a mapreduce operation involving large arrays (comparable to the memory allocated to each core) evaluated on 56 cores is
+
+```julia
+julia> @everywhere f(x) = ones(12_000, 20_000);
+
+julia> @time ParallelUtilities.pmapreduce(f, elsum, 1:nworkers());
+ 36.824788 seconds (26.40 k allocations: 1.789 GiB, 0.05% gc time)
+```
+
+# Comparison with other parallel mapreduce packages
+
+Other packages that perform parallel mapreduce are [`ParallelMapReduce`](https://github.com/hcarlsso/ParallelMapReduce.jl) and [`Transducers`](https://github.com/JuliaFolds/Transducers.jl). The latter provides a `foldxd` function that performs an associative distributed `mapfold`. The performances of these functions compared to this package (measured on 1 node with 28 cores) are listed below:
+
+```julia
+julia> @everywhere f(x) = ones(10_000, 10_000);
+
+julia> A = @time ParallelUtilities.pmapreduce(f, +, 1:nworkers());
+ 10.105696 seconds (14.03 k allocations: 763.511 MiB)
+
+julia> B = @time ParallelMapReduce.pmapreduce(f, +, 1:nworkers(), algorithm = :reduction_local);
+ 30.955381 seconds (231.93 k allocations: 41.200 GiB, 7.63% gc time, 0.23% compilation time)
+
+julia> C = @time Transducers.foldxd(+, 1:nworkers() |> Transducers.Map(f));
+ 30.154166 seconds (655.40 k allocations: 41.015 GiB, 8.65% gc time, 1.03% compilation time)
+
+julia> A == B == C
+true
+```
+
+Note that at present the performances of the `pmapreduce*` functions defined in this package are not comparable to equivalent MPI implementations. For example, an MPI mapreduce operation using [`MPIMapReduce.jl`](https://github.com/jishnub/MPIMapReduce.jl) computes an inplace sum over `10_000 x 10_000` matrices on each core in
+
+```julia
+3.413968 seconds (3.14 M allocations: 1.675 GiB, 2.99% gc time)
+```
+
+whereas this package computes it in
+```julia
+julia> @time ParallelUtilities.pmapreduce(f, elsum, 1:nworkers());
+  7.264023 seconds (12.46 k allocations: 763.450 MiB, 1.69% gc time)
+```
+
+This performance gap might reduce in the future.
+
+!!! note
+    The timings have all been measured on Julia 1.6 on an HPC cluster that has nodes with 2 Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz CPUs ("Broadwell", 14 cores/socket, 28 cores/node). They are also measured for subsequent runs after an initial precompilation step. The exact evaluation time might also vary depending on the cluster load.
+
+# Known issues
+
+1. This package currently does not implement a specialized `mapreduce` for arrays, so the behavior might differ for specialized array argument types (eg. `DistributedArray`s). This might change in the future.
+
+2. This package deals with distributed (multi-core) parallelism, and at this moment it has not been tested alongside multi-threading.
diff --git a/docs/src/pmapreduce.md b/docs/src/pmapreduce.md
new file mode 100644
index 0000000..d4329be
--- /dev/null
+++ b/docs/src/pmapreduce.md
@@ -0,0 +1,297 @@
+```@meta
+DocTestSetup  = quote
+    using ParallelUtilities
+end
+```
+
+# Parallel mapreduce
+
+There are two modes of evaluating a parallel mapreduce that vary only in the arguments that the mapping function accepts.
+
+1. Iterated zip, where one element from the zipped iterators is splatted and passed as arguments to the mapping function. In this case the function must accept as many arguments as the number of iterators passed to mapreduce. This is analogous to a serial `mapreduce`.
+
+2. Non-iterated product, in which case the iterator product of the arguments is distributed evenly across the workers. The mapping function in this case should accept one argument that is a collection of `Tuple`s of values. It may iterate over the argument to obtain the individual `Tuple`s.
+
+Each process involved in a `pmapreduce` operation carries out a local `mapreduce`, followed by a reduction across processes. The reduction is carried out in the form of a binary tree. The reduction happens in three stages:
+
+1. A local reduction as a part of `mapreduce`
+2. A reduction on the host across the workers on the same host. Typically on an HPC system there is an independent reduction on each node across the processes on that node.
+3. A global reduction across hosts.
+
+The reduction operator is assumed to be associative, and reproducibility of floating-point operations is not guaranteed. For reductions that require a guaranteed order of association, look into the various `mapfold*` methods provided by other packages, such as [`Transducers`](https://github.com/JuliaFolds/Transducers.jl). The reduction operator is not assumed to be commutative.
+
+A `pmapreduce` might only benefit in performance if the mapping function runs for longer than the communication overhead across processes, or if each process has dedicated memory and returns large arrays that may not be collectively stored on one process.
+
+## Iterated Zip
+
+The syntax for a parallel map-reduce operation is quite similar to the serial `mapreduce`, with the replacement of `mapreduce` by `pmapreduce`.
+
+Serial:
+
+```julia
+julia> mapreduce(x -> x^2, +, 1:100_000)
+333338333350000
+```
+
+Parallel:
+
+```julia
+julia> pmapreduce(x -> x^2, +, 1:100_000)
+333338333350000
+```
+
+We may check that parallel evaluation helps in performance for a long-running process.
+
+```julia
+julia> nworkers()
+2
+
+julia> @time mapreduce(x -> (sleep(1); x^2), +, 1:6);
+  6.079191 seconds (54.18 k allocations: 3.237 MiB, 1.10% compilation time)
+
+julia> @time pmapreduce(x -> (sleep(1); x^2), +, 1:6);
+  3.365979 seconds (91.57 k allocations: 5.473 MiB, 0.87% compilation time)
+```
+
+## Non-iterated product
+
+The second mode of usage is similar to MPI, where each process evaluates the same function once for different arguments. This is called using
+
+```julia
+pmapreduce_productsplit(f, op, iterators...)
+```
+
+In this function, the iterator product of the argument `iterators` is split evenly across the workers, and
+the function `f` on each process receives one such section according to its rank. The argument is an iterator similar to an iterator product, and looping over it would produce Tuples `(iterators[1][i], iterators[2][j], ...)` where the indices `i`, `j`, ... depend on the rank of the worker as well as the local loop index.
+
+As an example, we run this with 2 workers:
+
+```julia
+julia> pmapreduce_productsplit(ps -> (@show collect(ps)), vcat, 1:4)
+      From worker 2:    collect(ps) = [(1,), (2,)]
+      From worker 3:    collect(ps) = [(3,), (4,)]
+4-element Vector{Tuple{Int64}}:
+ (1,)
+ (2,)
+ (3,)
+ (4,)
+
+julia> pmapreduce_productsplit(ps -> (@show collect(ps)), vcat, 1:3, 1:2)
+      From worker 2:    collect(ps) = [(1, 1), (2, 1), (3, 1)]
+      From worker 3:    collect(ps) = [(1, 2), (2, 2), (3, 2)]
+6-element Vector{Tuple{Int64, Int64}}:
+ (1, 1)
+ (2, 1)
+ (3, 1)
+ (1, 2)
+ (2, 2)
+ (3, 2)
+```
+
+Note that in each case the mapping function receives the entire collection of arguments in one go, unlike a standard `mapreduce` where the function receives the arguments individually. This is chosen so that the function may perform any one-time compute-intensive task for the entire range before looping over the argument values.
+
+Each process might return one or more values that are subsequently reduced in parallel.
+
+!!! note
+    At present the `iterators` passed as arguments to `pmapreduce_productsplit` may only be strictly increasing ranges. This might be relaxed in the future.
+
+The argument `ps` passed on to each worker is a [`ParallelUtilities.ProductSplit`](@ref) object. This has several methods defined for it that might aid in evaluating the mapping function locally.
+
+### ProductSplit
+
+A `ProductSplit` object `ps` holds the section of the iterator product that is assigned to the worker. It also encloses the worker rank and the size of the worker pool, similar to MPI's `Comm_rank` and `Comm_size`. These may be accessed as `workerrank(ps)` and `nworkers(ps)`. Unlike MPI though, the rank goes from `1` to `np`. An example where the worker rank is used (on 2 workers) is
+
+```julia
+julia> pmapreduce_productsplit(ps -> ones(2) * workerrank(ps), hcat, 1:nworkers())
+2×2 Matrix{Float64}:
+ 1.0  2.0
+ 1.0  2.0
+```
+
+The way to construct a `ProductSplit` object is `ParallelUtilities.ProductSplit(tuple_of_iterators, nworkers, worker_rank)`
+
+```jldoctest productsplit; setup=:(using ParallelUtilities)
+julia> ps = ParallelUtilities.ProductSplit((1:2, 3:4), 2, 1)
+2-element ProductSplit [(1, 3), ... , (2, 3)]
+
+julia> ps |> collect
+2-element Vector{Tuple{Int64, Int64}}:
+ (1, 3)
+ (2, 3)
+```
+
+A `ProductSplit` that wraps `AbstractUnitRange`s has several efficient functions defined for it, such as `length`, `minimumelement`, `maximumelement` and `getindex`, each of which returns in `O(1)` without iterating over the object.
+
+```jldoctest productsplit
+julia> ps[1]
+(1, 3)
+```
+
+The functions `maximumelement`, `minimumelement` and `extremaelement` treat the `ProductSplit` object as a linear view of an `n`-dimensional iterator product. These functions look through the elements in the `dim`-th dimension of the iterator product, and if possible, return the corresponding extremal element in `O(1)` time. Similarly, for a `ProductSplit` object that wraps `AbstractUnitRange`s, it's possible to know if a value is contained in the iterator in `O(1)` time.
+
+```julia productsplit
+julia> ps = ParallelUtilities.ProductSplit((1:100_000, 1:100_000, 1:100_000), 25000, 1500)
+40000000000-element ProductSplit [(1, 1, 5997), ... , (100000, 100000, 6000)]
+
+julia> @btime (3,3,5998) in $ps
+  111.399 ns (0 allocations: 0 bytes)
+true
+
+julia> @btime ParallelUtilities.maximumelement($ps, dims = 1)
+  76.534 ns (0 allocations: 0 bytes)
+100000
+
+julia> @btime ParallelUtilities.minimumelement($ps, dims = 2)
+  73.724 ns (0 allocations: 0 bytes)
+1
+
+julia> @btime ParallelUtilities.extremaelement($ps, dims = 2)
+  76.332 ns (0 allocations: 0 bytes)
+(1, 100000)
+```
+
+The number of unique elements along a particular dimension may be obtained as
+
+```julia productsplit
+julia> @btime ParallelUtilities.nelements($ps, dims = 3)
+  118.441 ns (0 allocations: 0 bytes)
+4
+```
+
+It's also possible to drop the leading dimension of a `ProductSplit` that wraps `AbstractUnitRange`s to obtain an analogous iterator that contains the unique elements along the remaining dimensions. This is achieved using `ParallelUtilities.dropleading`.
+
+```jldoctest productsplit
+julia> ps = ParallelUtilities.ProductSplit((1:3, 1:3, 1:2), 4, 2)
+5-element ProductSplit [(3, 2, 1), ... , (1, 1, 2)]
+
+julia> collect(ps)
+5-element Vector{Tuple{Int64, Int64, Int64}}:
+ (3, 2, 1)
+ (1, 3, 1)
+ (2, 3, 1)
+ (3, 3, 1)
+ (1, 1, 2)
+
+julia> ps2 = ParallelUtilities.dropleading(ps)
+3-element ProductSection [(2, 1), ... , (1, 2)]
+
+julia> collect(ps2)
+3-element Vector{Tuple{Int64, Int64}}:
+ (2, 1)
+ (3, 1)
+ (1, 2)
+```
+
+The process may be repeated multiple times:
+
+```jldoctest productsplit
+julia> collect(ParallelUtilities.dropleading(ps2))
+2-element Vector{Tuple{Int64}}:
+ (1,)
+ (2,)
+```
+
+# Reduction Operators
+
+Any standard Julia reduction operator may be passed to `pmapreduce`. Aside from this, this package defines certain operators that may be used as well in a reduction.
+
+## Broadcasted elementwise operators
+
+The general way to construct an elementwise operator using this package is using [`ParallelUtilities.BroadcastFunction`](@ref).
+
+For example, a broadcasted sum operator may be constructed using
+```jldoctest
+julia> ParallelUtilities.BroadcastFunction(+);
+```
+
+The function call `ParallelUtilities.BroadcastFunction(op)(x, y)` performs the fused elementwise operation `op.(x, y)`.
+
+!!! note "Julia 1.6 and above"
+    Julia versions `v"1.6"` and above provide a function `Base.BroadcastFunction` which is equivalent to `ParallelUtilities.BroadcastFunction`.
+
+## Inplace assignment
+
+The function [`ParallelUtilities.broadcastinplace`](@ref) may be used to construct a binary operator that broadcasts a function over its arguments and stores the result inplace in one of the arguments. This is particularly useful if the results in intermediate evaluations are not important, as this cuts down on allocations in the reduction.
+
+Several operators for common functions are pre-defined for convenience.
+
+1. [`ParallelUtilities.elementwisesum!`](@ref)
+2. [`ParallelUtilities.elementwiseproduct!`](@ref)
+3. [`ParallelUtilities.elementwisemin!`](@ref)
+4. [`ParallelUtilities.elementwisemax!`](@ref)
+
+Each of these functions overwrites the first argument with the result.
+
+!!! warning
+    The pre-defined elementwise operators are assumed to be commutative, so, if used in `pmapreduce`, the order of arguments passed to the function is not guaranteed. In particular this might not be in order of the `workerrank`. These functions should only be used if both the arguments support the inplace assignment, eg. if they have identical axes.
+
+## Flip
+
+The [`ParallelUtilities.Flip`](@ref) function may be used to wrap a binary function so that it flips the order of its arguments. For example
+
+```jldoctest
+julia> vcat(1,2)
+2-element Vector{Int64}:
+ 1
+ 2
+
+julia> ParallelUtilities.Flip(vcat)(1,2)
+2-element Vector{Int64}:
+ 2
+ 1
+```
+
+`Flip` may be combined with inplace assignment operators to change the argument that is overwritten.
+
+```jldoctest
+julia> x = ones(3); y = ones(3);
+
+julia> op1 = ParallelUtilities.elementwisesum!; # overwrites the first argument
+
+julia> op1(x, y); x
+3-element Vector{Float64}:
+ 2.0
+ 2.0
+ 2.0
+
+julia> x = ones(3); y = ones(3);
+
+julia> op2 = ParallelUtilities.Flip(op1); # overwrites the second argument
+
+julia> op2(x, y); y
+3-element Vector{Float64}:
+ 2.0
+ 2.0
+ 2.0
+```
+
+## BroadcastStack
+
+This function may be used to combine arrays having overlapping axes to obtain a new array that spans the union of axes of the arguments. The overlapping section is computed by applying the reduction function to that section.
+
+We construct a function that concatenates arrays along the first dimension with overlapping indices summed.
+```jldoctest broadcaststack
+julia> f = ParallelUtilities.BroadcastStack(+, 1);
+```
+
+We apply this to two arrays having different indices
+```jldoctest broadcaststack
+julia> f(ones(2), ones(4))
+4-element Vector{Float64}:
+ 2.0
+ 2.0
+ 1.0
+ 1.0
+```
+
+This function is useful to reduce [`OffsetArray`s](https://github.com/JuliaArrays/OffsetArrays.jl) where each process evaluates a potentially overlapping section of the entire array.
+
+!!! note
+    A `BroadcastStack` function requires its arguments to have the same dimensionality, and identical axes along non-concatenated dimensions. In particular it is not possible to block-concatenate arrays using this function.
+
+!!! note
+    A `BroadcastStack` function does not operate in-place.
+
+## Commutative
+
+In general this package does not assume that a reduction operator is commutative. It's possible to declare an operator to be commutative in its arguments by wrapping it in the tag [`ParallelUtilities.Commutative`](@ref). 
diff --git a/src/ParallelUtilities.jl b/src/ParallelUtilities.jl
index 603fc5a..27d1750 100644
--- a/src/ParallelUtilities.jl
+++ b/src/ParallelUtilities.jl
@@ -1,35 +1,20 @@
 module ParallelUtilities
-using ProgressMeter
-using Reexport
-using OffsetArrays
 
-@reexport using Distributed
+using Distributed
 
-export ProductSplit,
-    ntasks,
-    whichproc,
-    procrange_recast,
-    localindex,
-    whichproc_localindex,
-    extremadims,
-    extrema_commonlastdim,
-    pmapbatch,
-    pmapbatch_elementwise,
-    pmapsum,
-    pmapsum_elementwise,
-    pmapreduce,
-    pmapreduce_commutative,
-    pmapreduce_commutative_elementwise
+export pmapreduce
+export pmapreduce_productsplit
+export pmapbatch
+export pmapbatch_productsplit
+export workerrank
 
-include("errors.jl")
 include("productsplit.jl")
 
 include("clusterquery.jl")
-@reexport using .ClusterQueryUtils
+using .ClusterQueryUtils: procs_node
 
-include("utils.jl")
 include("trees.jl")
-include("mapreduce.jl")
 include("reductionfunctions.jl")
+include("mapreduce.jl")
 
 end # module
diff --git a/src/clusterquery.jl b/src/clusterquery.jl
index cf8f23b..01c3c4d 100644
--- a/src/clusterquery.jl
+++ b/src/clusterquery.jl
@@ -3,33 +3,18 @@ module ClusterQueryUtils
 using Distributed
 using DataStructures
 
-@deprecate gethostnames hostnames
 export hostnames
 export nodenames
 export procs_node
 export nprocs_node
 
 """
-    gethostnames(procs = workers())
+    hostnames([procs = workers()])
 
-Return the hostname of each worker in `procs`. This is obtained by evaluating 
-`Libc.gethostname()` on each worker asynchronously.
-
-!!! warn
-    `gethostnames` is deprecated in favor of `hostnames`    
-"""
-gethostnames
-
-"""
-    hostnames(procs = workers())
-
-Return the hostname of each worker in `procs`. This is obtained by evaluating 
+Return the hostname of each worker in `procs`. This is obtained by evaluating
 `Libc.gethostname()` on each worker asynchronously.
 """
 function hostnames(procs = workers())
-    Base.depwarn("hostnames will not be exported in a future release. "*
-    "It may be imported from the module ClusterQueryUtils", :hostnames)
-
     hostnames = Vector{String}(undef, length(procs))
 
     @sync for (ind,p) in enumerate(procs)
@@ -40,22 +25,19 @@ end
 
 
 """
-    nodenames(procs = workers())
+    nodenames([procs = workers()])
 
-Return the unique hostnames that the workers in `procs` lie on. 
+Return the unique hostnames that the workers in `procs` lie on.
 On an HPC system these are usually the hostnames of the nodes involved.
 """
 nodenames(procs = workers()) = nodenames(hostnames(procs))
 
 function nodenames(hostnames::AbstractVector{String})
-   Base.depwarn("nodenames will not be exported in a future release. "*
-    "It may be imported from the module ClusterQueryUtils", :nodenames)
-    
     unique(hostnames)
 end
 
 """
-    procs_node(procs = workers())
+    procs_node([procs = workers()])
 
 Return the worker ids on each host of the cluster.
 On an HPC system this would return the workers on each node.
@@ -67,14 +49,11 @@ function procs_node(procs = workers())
 end
 
 function procs_node(procs, hosts, nodes)
-    Base.depwarn("procs_node will not be exported in a future release. "*
-        "It may be imported from the module ClusterQueryUtils", :procs_node)
-    
     OrderedDict(node => procs[findall(isequal(node),hosts)] for node in nodes)
 end
 
 """
-    nprocs_node(procs = workers())
+    nprocs_node([procs = workers()])
 
 Return the number of workers on each host.
 On an HPC system this would return the number of workers on each node.
@@ -89,16 +68,10 @@ function nprocs_node(hostnames::AbstractVector{String})
 end
 
 function nprocs_node(hostnames::AbstractVector, nodes::AbstractVector)
-    Base.depwarn("nprocs_node will not be exported in a future release. "*
-        "It may be imported from the module ClusterQueryUtils", :nprocs_node)
-
     OrderedDict(node => count(isequal(node), hostnames) for node in nodes)
 end
 
 function nprocs_node(d::AbstractDict)
-    Base.depwarn("nprocs_node will not be exported in a future release. "*
-        "It may be imported from the module ClusterQueryUtils", :nprocs_node)
-
     OrderedDict(node => length(procs) for (node, procs) in d)
 end
 
diff --git a/src/errors.jl b/src/errors.jl
deleted file mode 100644
index c1de4f5..0000000
--- a/src/errors.jl
+++ /dev/null
@@ -1,7 +0,0 @@
-struct TaskNotPresentError{T,U} <: Exception
-	t :: T
-	task :: U
-end
-function Base.showerror(io::IO,err::TaskNotPresentError)
-	print(io,"could not find the task $(err.task) in the list $(err.t)")
-end
\ No newline at end of file
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 4033abe..6b73f60 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -1,66 +1,109 @@
 # Store the rank with the value, necessary for collecting values in order
 struct pval{T}
     rank :: Int
+    errorstatus :: Bool
     value :: T
 end
+pval{T}(p::pval{T}) where {T} = p
+pval{T}(p::pval) where {T} = pval{T}(p.rank, p.errorstatus, convert(T, p.value))
+
+errorpval(rank) = pval(rank, true, nothing)
+
+errorstatus(p::pval) = p.errorstatus
 
 # Function to obtain the value of pval types
 value(p::pval) = p.value
 value(p::Any) = p
 
-Base.convert(::Type{pval{T}},p::pval) where {T} = pval{T}(p.rank,convert(T,value(p)))
+struct Product{I}
+    iterators :: I
+end
 
-Base.:(==)(p1::pval,p2::pval) = (p1.rank == p2.rank) && (value(p1) == value(p2))
+getiterators(p::Product) = p.iterators
 
-############################################################################################
-# Map
-############################################################################################
+Base.length(p::Product) = length(Iterators.product(p.iterators...))
+Base.iterate(p::Product, st...) = iterate(Iterators.product(p.iterators...), st...)
+
+function product(iter...)
+    any(x -> x isa Product, iter) && throw(ArgumentError("the iterators should not be Products"))
+    Product(iter)
+end
+
+struct Hold{T}
+    iterators :: T
+end
+
+getiterators(h::Hold) = getiterators(h.iterators)
 
-# Wrap a pval around the mapped value if sorting is necessary
-function maybepvalput!(pipe::BranchChannel{T}, rank, val) where {T}
-    put!(pipe.selfchannels.out,val)
+Base.length(h::Hold) = length(h.iterators)
+
+check_knownsize(iterators::Tuple) = _check_knownsize(first(iterators)) & check_knownsize(Base.tail(iterators))
+check_knownsize(::Tuple{}) = true
+function _check_knownsize(iterator)
+    itsz = Base.IteratorSize(iterator)
+    itsz isa Base.HasLength || itsz isa Base.HasShape
 end
-function maybepvalput!(pipe::BranchChannel{T}, rank, val) where {T<:pval}
-    valT = T(rank,value(val))
-    put!(pipe.selfchannels.out, valT)
+
+function zipsplit(iterators::Tuple, np::Integer, p::Integer)
+    check_knownsize(iterators)
+    itzip = zip(iterators...)
+    d,r = divrem(length(itzip), np)
+    skipped_elements = d*(p-1) + min(r,p-1)
+    lastind = d*p + min(r,p)
+    elements_on_proc = lastind - skipped_elements
+    Iterators.take(Iterators.drop(itzip, skipped_elements), elements_on_proc)
 end
 
-function indicatemapprogress!(::Nothing, rank) end
-function indicatemapprogress!(progress::RemoteChannel, rank)
-    put!(progress, (true,false,rank))
+_split_iterators(iterators, np, p) = (zipsplit(iterators, np, p),)
+function _split_iterators(iterators::Tuple{Hold{<:Product}}, np, p)
+    it_hold = first(iterators)
+    (ProductSplit(getiterators(it_hold), np, p), )
+end
+
+############################################################################################
+# Local mapreduce
+############################################################################################
+
+struct NoSplat <: Function
+    f :: Function
 end
+NoSplat(u::NoSplat) = u
+
+_maybesplat(f) = Base.splat(f)
+_maybesplat(f::NoSplat) = f
 
-function indicatefailure!(::Nothing, rank) end
-function indicatefailure!(progress::RemoteChannel, rank)
-    put!(progress, (false,false,rank))
+_mapreduce(f, op, iterators...; reducekw...) = mapreduce(f, op, iterators...; reducekw...)
+function _mapreduce(fun::NoSplat, op, iterators...; reducekw...)
+    mapval = fun.f(iterators...)
+    reduce(op, (mapval,); reducekw...)
 end
 
-function mapTreeNode(fmap::Function, iterator, rank, pipe::BranchChannel,
-    progress::Union{Nothing,RemoteChannel}, args...;kwargs...)
+function mapreducenode(f, op, rank, pipe::BranchChannel, selfoutchannel, iterators...; reducekw...)
     # Evaluate the function
-    # Store the error flag locally
-    # If there are no errors then store the result locally
-    # No communication with other nodes happens here other than indicating the progress status
+    # No communication with other nodes happens here
     try
-        res = fmap(iterator, args...;kwargs...)
-        maybepvalput!(pipe, rank, res)
-        put!(pipe.selfchannels.err, false)
-        indicatemapprogress!(progress, rank)
+        fmap = _maybesplat(f)
+        if rank == 1
+            res = _mapreduce(fmap, op, iterators...; reducekw...)
+        else
+            # init should only be used once on the first rank
+            # remove it from the kwargs on other workers
+            kwdict = Dict(reducekw)
+            pop!(kwdict, :init, nothing)
+            res = _mapreduce(fmap, op, iterators...; kwdict...)
+        end
+        val = pval(rank, false, res)
+        put!(selfoutchannel, val)
     catch
-        put!(pipe.selfchannels.err, true)
-        indicatefailure!(progress, rank)
+        put!(selfoutchannel, errorpval(rank))
         rethrow()
     end
 end
 
 ############################################################################################
-# Reduction
+# Reduction across workers
 ############################################################################################
 
-abstract type Ordering end
-struct Sorted <: Ordering end
-struct Unsorted <: Ordering end
-
 abstract type ReductionNode end
 struct TopTreeNode <: ReductionNode
     rank :: Int
@@ -69,591 +112,208 @@ struct SubTreeNode <: ReductionNode
     rank :: Int
 end
 
-function reducedvalue(freduce::Function, rank,
-    pipe::BranchChannel, ifsorted::Ordering)
+_maybesort(op::Commutative, vals) = vals
+_maybesort(op, vals) = sort!(vals, by = pv -> pv.rank)
 
-    reducedvalue(freduce,
-        rank > 0 ? SubTreeNode(rank) : TopTreeNode(rank),
-        pipe, ifsorted)
+function reducechannel(op, c, N; reducekw...)
+    vals = [take!(c) for i = 1:N]
+    vals = _maybesort(op, vals)
+    v = [value(v) for v in vals]
+    reduce(op, v; reducekw...)
 end
 
-function reducedvalue(freduce::Function, node::SubTreeNode,
-    pipe::BranchChannel{Tmap,Tred}, ::Unsorted) where {Tmap,Tred}
-
-    self = take!(pipe.selfchannels.out) :: Tmap
-    N = nchildren(pipe)
-    vals = Vector{Tred}(undef, N + 1)
-    
-    vals[1] = freduce((self,)) :: Tred
-    
-    for i = 1:N
-        vals[i+1] = take!(pipe.childrenchannels.out)::Tred
-    end
-    
-    freduce(vals)
-end
-function reducedvalue(freduce::Function, node::TopTreeNode,
-    pipe::BranchChannel{<:Any,Tred}, ::Unsorted) where {Tred}
-
-    N = nchildren(pipe)
-    if N == 0
-        # shouldn't reach this
-        error("Nodes on the top tree must have children")
-    end
-    
-    vals = Vector{Tred}(undef, N)
-    
-    for i = 1:N
-        vals[i] = take!(pipe.childrenchannels.out)::Tred
-    end
-
-    freduce(vals)
-end
-
-function reducedvalue(freduce::Function, node::SubTreeNode,
-    pipe::BranchChannel{Tmap,Tred}, ::Sorted) where {Tmap,Tred}
+seterrorflag(c, val) = put!(c, take!(c) | val)
 
+function reducedvalue(op, node::SubTreeNode, pipe::BranchChannel, selfoutchannel; reducekw...)
     rank = node.rank
-    N = nchildren(pipe)
-    leftchild = N > 0
-    vals = Vector{Tred}(undef, N + 1)
-    
-    selfval = take!(pipe.selfchannels.out)::Tmap
-    selfvalred = freduce((value(selfval),))
-    pv = pval(rank,selfvalred)
-    ind = leftchild + 1
-    vals[ind] = pv
-
-    for i = 1:N
-        pv = take!(pipe.childrenchannels.out) :: Tred
-        shift = pv.rank > rank ? 1 : -1
-        ind = shift + leftchild + 1
-        vals[ind] = pv
-    end
 
-    Tred(rank, freduce(value(v) for v in vals))
-end
-function reducedvalue(freduce::Function, node::TopTreeNode,
-    pipe::BranchChannel{<:Any,Tred}, ::Sorted) where {Tred}
+    N = nchildren(pipe) + 1
+    err_ch = Channel{Bool}(1)
+    put!(err_ch, false)
 
-    rank = node.rank
-    N = nchildren(pipe)
-    if N == 0
-        # shouldn't reach this
-        error("Nodes on the top tree must have children")
+    self_pval = take!(selfoutchannel)
+    if errorstatus(self_pval)
+        return errorpval(rank)
+    else
+        put!(selfoutchannel, self_pval)
     end
 
-    vals = Vector{Tred}(undef, N)
-
-    for i = 1:N
-        pv = take!(pipe.childrenchannels.out) :: Tred
-        vals[i] = pv
+    @sync for i = 1:nchildren(pipe)
+        @async begin
+            child_pval = take!(pipe.childrenchannel)
+            if errorstatus(child_pval)
+                seterrorflag(err_ch, true)
+            else
+                put!(selfoutchannel, child_pval)
+                seterrorflag(err_ch, false)
+            end
+        end
     end
 
-    sort!(vals, by = pv -> pv.rank)
-
-    Tred(rank, freduce(value(v) for v in vals))
-end
-
-function indicatereduceprogress!(::Nothing,rank) end
-function indicatereduceprogress!(progress::RemoteChannel,rank)
-    put!(progress,(false,true,rank))
-end
+    take!(err_ch) && return errorpval(rank)
 
-function reduceTreeNode(freduce::Function, rank, pipe::BranchChannel,
-    ifsort::Ordering, progress)
-    
-    reduceTreeNode(freduce,
-        rank > 0 ? SubTreeNode(rank) : TopTreeNode(rank),
-        pipe, ifsort, progress)
-end
+    redval = reducechannel(op, selfoutchannel, N; reducekw...)
 
-function checkerror(::SubTreeNode, pipe::BranchChannel)
-    selferr = take!(pipe.selfchannels.err)
-    childrenerr = any(take!(pipe.childrenchannels.err) for i=1:nchildren(pipe))
-    selferr || childrenerr
-end
-function checkerror(::TopTreeNode, pipe::BranchChannel)
-    any(take!(pipe.childrenchannels.err) for i=1:nchildren(pipe))
+    return pval(rank, false, redval)
 end
+function reducedvalue(op, node::TopTreeNode, pipe::BranchChannel, ::Any; reducekw...)
+    rank = node.rank
 
-function reduceTreeNode(freduce::Function, node::ReductionNode,
-    pipe::BranchChannel{<:Any,Tred},
-    ifsort::Ordering, progress::Union{Nothing,RemoteChannel}) where {Tred}
-    # This function that communicates with the parent and children
+    N = nchildren(pipe)
+    c = Channel(N)
+    err_ch = Channel{Bool}(1)
+    put!(err_ch, false)
 
-    # Start by checking if there is any error locally in the map,
-    # and if there's none then check if there are any errors on the children
-    anyerr = checkerror(node, pipe)
-    rank = node.rank
-    # Evaluate the reduction only if there's no error
-    # In either case push the error flag to the parent
-    if !anyerr
-        try
-            res = reducedvalue(freduce, node, pipe, ifsort) :: Tred
-            put!(pipe.parentchannels.out, res)
-            put!(pipe.parentchannels.err, false)
-            indicatereduceprogress!(progress, rank)
-        catch e
-            put!(pipe.parentchannels.err, true)
-            indicatefailure!(progress, rank)
-            rethrow()
+    @sync for i in 1:N
+        @async begin
+            child_pval = take!(pipe.childrenchannel)
+            if errorstatus(child_pval)
+                seterrorflag(err_ch, true)
+            else
+                put!(c, child_pval)
+                seterrorflag(err_ch, false)
+            end
         end
-    else
-        put!(pipe.parentchannels.err, true)
-        indicatefailure!(progress, rank)
     end
 
-    finalize(pipe)
-end
+    take!(err_ch) && return errorpval(rank)
 
-function return_unless_error(r::RemoteChannelContainer)
-    anyerror = take!(r.err)
-    if !anyerror
-        return value(take!(r.out))
-    end
-end
+    redval = reducechannel(op, c, N; reducekw...)
 
-function return_unless_error(b::BranchChannel)
-    return_unless_error(b.parentchannels)
+    return pval(rank, false, redval)
 end
 
-function pmapreduceworkers(fmap::Function, freduce::Function, iterators::Tuple,
-    tree, branches, ord::Ordering, args...;
-    kwargs...)
-
-    kwargs_ = Dict(kwargs)
-    if haskey(kwargs, :showprogress)
-        Base.depwarn("showprogress is deprecated and will be removed in a future release", :pmapreduceworkers)
+function reducenode(op, node::ReductionNode, pipe::BranchChannel, selfoutchannel = nothing; kwargs...)
+    # This function communicates with the parent and children
+    rank = node.rank
+    try
+        kwdict = Dict(kwargs)
+        pop!(kwdict, :init, nothing)
+        res = reducedvalue(op, node, pipe, selfoutchannel; kwdict...)
+        put!(pipe.parentchannel, res)
+    catch
+        put!(pipe.parentchannel, errorpval(rank))
+        rethrow()
+    finally
+        GC.gc()
     end
 
-    showprogress = get(kwargs, :showprogress, false)
-    progressdesc = get(kwargs, :progressdesc, "Progress in pmapreduce : ")
-    delete!(kwargs_, :showprogress)
-    delete!(kwargs_, :progressdesc)
-
-    num_workers_active = nworkersactive(iterators)
-    Nmaptotal = num_workers_active
-    Nreducetotal = length(branches)
-    extrareducenodes = Nreducetotal - Nmaptotal
-    
-    Nprogress = Nmaptotal+Nreducetotal
-    progresschannel = RemoteChannel(()->Channel{Tuple{Bool,Bool,Int}}(
-                        ifelse(showprogress,Nprogress,0)))
-    progressbar = Progress(Nprogress,1,progressdesc)
-
-    @sync begin
-
-        for (ind,mypipe) in enumerate(branches)
-            p = mypipe.p
-            ind_reduced = ind - extrareducenodes
-            rank = ind_reduced
-            if ind_reduced > 0
-                iterable_on_proc = ProductSplit(iterators,num_workers_active,rank)
-
-                @spawnat p mapTreeNode(fmap, iterable_on_proc, rank, mypipe,
-                    showprogress ? progresschannel : nothing,
-                    args...; kwargs_...)
-
-                @spawnat p reduceTreeNode(freduce, SubTreeNode(rank),
-                    mypipe, ord, showprogress ? progresschannel : nothing)
-            else
-                @spawnat p reduceTreeNode(freduce, TopTreeNode(rank),
-                    mypipe, ord, showprogress ? progresschannel : nothing)
-            end
-        end
-
-        if showprogress
-
-            mapdone,reducedone = 0,0
+    return nothing
+end
 
-            for i = 1:Nprogress
-                mapflag,redflag,rank = take!(progresschannel)
-                # both flags are false in case of an error
-                mapflag || redflag || break
+function pmapreduceworkers(f, op, tree_branches, iterators; reducekw...)
 
-                mapdone += mapflag
-                reducedone += redflag
+    tree, branches = tree_branches
 
-                if mapdone != Nmaptotal && reducedone != Nreducetotal
-                    showvalues = [
-                    (:map, string(mapdone)*"/"*string(Nmaptotal)),
-                    (:reduce, string(reducedone)*"/"*string(Nreducetotal))
-                    ]
+    nworkerstree = nworkers(tree)
+    extrareducenodes = length(tree) - nworkerstree
 
-                elseif reducedone != Nreducetotal
-                    showvalues = [
-                    (:reduce, string(reducedone)*"/"*string(Nreducetotal))
-                    ]
-                else
-                    showvalues = []
+    @sync for (ind, mypipe) in enumerate(branches)
+        p = mypipe.p
+        ind_reduced = ind - extrareducenodes
+        rank = ind_reduced
+        if ind_reduced > 0
+            iterable_on_proc = _split_iterators(iterators, nworkerstree, rank)
+            @spawnat p begin
+                selfoutchannel = Channel(nchildren(mypipe) + 1)
+                @sync begin
+                    @async mapreducenode(f, op, rank, mypipe, selfoutchannel, iterable_on_proc...; reducekw...)
+                    @async reducenode(op, SubTreeNode(rank), mypipe, selfoutchannel; reducekw...)
                 end
-
-                next!(progressbar;showvalues=showvalues)
             end
+        else
+            @spawnat p reducenode(op, TopTreeNode(rank), mypipe; reducekw...)
         end
     end
 
-    return_unless_error(topbranch(tree,branches))
+    tb = topbranch(tree, branches)
+    value(take!(tb.parentchannel))
 end
 
 """
-    pmapreduce_commutative(fmap, freduce, iterators, [mapargs...]; 
-        <keyword arguments>, [mapkwargs...])
-
-Evaluate a parallel mapreduce over the range spanned by 
-the outer product of the iterators. The operation
-is assumed to be commutative, results obtained may be incorrect otherwise.
-
-The argument  `iterators` must be a strictly-increasing range of integers, 
-or a tuple of such ranges. 
-The outer product of the ranges is split evenly across the workers. 
-The function `fmap` receives a `ProductSplit` iterator as its first argument
-that acts as a collection of tuples. One may index into a `ProductSplit` 
-or iterate over one to access individual tuples of integers.
-
-The reduction function `freduce` is expected to accept a collection of mapped values.
-Note that this is different from the standard `mapreduce` operation in julia that 
-expects a binary reduction operator. For example, `freduce` should be 
-`sum` and not `+`. In case a binary operator `op` is to be used in the reduction, one may pass it 
-as `Base.splat(op)` or wrap it in an anonymous function as `x -> op(x...)`.
-
-Arguments `mapargs` and keyword arguments `mapkwargs` — if provided — are 
-passed on to the mapping function `fmap`.
-
-    pmapreduce_commutative(fmap, Tmap, freduce, Treduce, iterators, [mapargs...]; 
-        <keyword arguments>, [mapkwargs...])
-
-The types `Tmap` and `Treduce` are the return types of the map and 
-reduce operations respectively. The returned values will be coerced to 
-the specified types if possible, throwing an error otherwise. 
+    pmapreduce(f, op, [pool::AbstractWorkerPool], iterators...; reducekw...)
 
-# Keyword Arguments
+Evaluate a parallel `mapreduce` over the elements from `iterators`.
+For multiple iterators, apply `f` elementwise.
 
-- `showprogress::Bool = false` : Displays a progress-bar if set to true
-- `progressdesc = "Progress in pmapreduce : "` : Leading text in the progress-bar
+The keyword arguments `reducekw` are passed on to the reduction.
 
-See also: [`pmapreduce_commutative_elementwise`](@ref), [`pmapreduce`](@ref), [`pmapsum`](@ref)
+See also: [`pmapreduce_productsplit`](@ref)
 """
-function pmapreduce_commutative(fmap::Function, Tmap::Type,
-    freduce::Function, Tred::Type, iterators::Tuple, args...;
-    kwargs...)
-    
-    tree,branches = createbranchchannels(Tmap,Tred,iterators,
-        SegmentedSequentialBinaryTree)
-    
-    pmapreduceworkers(fmap, freduce, iterators, tree,
-        branches, Unsorted(), args...;kwargs...)
-end
-
-function pmapreduce_commutative(fmap::Function, freduce::Function,
-    iterators::Tuple, args...;kwargs...)
-
-    pmapreduce_commutative(fmap, Any, freduce, Any, iterators, args...;kwargs...)
-end
-
-function pmapreduce_commutative(fmap::Function, Tmap::Type,
-    freduce::Function, Tred::Type, iterable, args...;kwargs...)
+function pmapreduce(f, op, pool::AbstractWorkerPool, iterators...; reducekw...)
+    N = length(zip(iterators...))
+
+    if N <= 1 || nworkers(pool) == 1
+        iterable_on_proc = _split_iterators(iterators, 1, 1)
+        fmap = _maybesplat(f)
+        if nprocs() == 1 # no workers added
+            return _mapreduce(fmap, op, iterable_on_proc...; reducekw...)
+        else # one worker or single-valued iterator
+            return @fetchfrom workers(pool)[1] _mapreduce(fmap, op, iterable_on_proc...; reducekw...)
+        end
+    end
 
-    pmapreduce_commutative(fmap, Tmap, freduce, Tred, (iterable,), args...;kwargs...)
+    tree_branches = createbranchchannels(pool, N)
+    pmapreduceworkers(f, op, tree_branches, iterators; reducekw...)
 end
 
-function pmapreduce_commutative(fmap::Function, freduce::Function, iterable, args...;kwargs...)
-    pmapreduce_commutative(fmap, freduce, (iterable,), args...;kwargs...)
+function pmapreduce(f, op, iterators...; reducekw...)
+    N = length(zip(iterators...))
+    pool = maybetrimmedworkerpool(workers(), N)
+    pmapreduce(f, op, pool, iterators...; reducekw...)
 end
 
 """
-    pmapreduce_commutative_elementwise(fmap, freduce, iterators, [mapargs...]; 
-        <keyword arguments>, [mapkwargs...])
-
-Evaluate a parallel mapreduce over the range spanned by 
-the outer product of the iterators. 
-The argument `iterators` must be a strictly-increasing range of integers, 
-or a tuple of such ranges. The map is evaluated elementwise 
-over the entire range of parameters.
-The reduction is assumed to be commutative, 
-results obtained may be incorrect otherwise.
-
-The reduction function `freduce` is expected to accept a collection of mapped values.
-Note that this is different from the standard `mapreduce` operation in julia that 
-expects a binary reduction operator. For example, `freduce` should be 
-`sum` and not `+`. In case a binary operator `op` is to be used in the reduction, one may pass it 
-as `Base.splat(op)` or wrap it in an anonymous function as `x -> op(x...)`.
-
-Arguments `mapargs` and keyword arguments `mapkwargs` — if provided — are 
-passed on to the mapping function `fmap`.
+    pmapreduce_productsplit(f, op, [pool::AbstractWorkerPool], iterators...; reducekw...)
 
-    pmapreduce_commutative_elementwise(fmap, Tmap, freduce, Treduce, iterators, [mapargs...]; 
-        <keyword arguments>, [mapkwargs...])
+Evaluate a parallel mapreduce over the outer product of elements from `iterators`.
+The product of `iterators` is split over the workers available, and each worker is assigned a section
+of the product. The function `f` should accept a single argument that is a collection of `Tuple`s.
 
-The types `Tmap` and `Treduce` are the return types of the map and 
-reduce operations respectively. The returned values will be coerced to 
-the specified types if possible, throwing an error otherwise. 
+The keyword arguments `reducekw` are passed on to the reduction.
 
-# Keyword Arguments
-
-- `showprogress::Bool = false` : Displays a progress-bar if set to true
-- `progressdesc = "Progress in pmapreduce : "` : Leading text in the progress-bar
-
-See also: [`pmapreduce_commutative`](@ref)
+See also: [`pmapreduce`](@ref)
 """
-function pmapreduce_commutative_elementwise(fmap::Function, Tmap::Type,
-    freduce::Function, Tred::Type, iterable, args...;
-    showprogress::Bool = false, progressdesc = "Progress in pmapreduce : ",
-    kwargs...)
-    
-    pmapreduce_commutative(
-        plist->freduce((fmap(x...,args...;kwargs...) for x in plist)),
-        Tred,freduce,Tred,iterable,
-        showprogress = showprogress, progressdesc = progressdesc)
-end
+pmapreduce_productsplit(f, op, pool::AbstractWorkerPool, iterators...; reducekw...) =
+    pmapreduce(NoSplat(f), op, pool, Hold(product(iterators...)); reducekw...)
 
-function pmapreduce_commutative_elementwise(fmap::Function, freduce::Function, iterable, args...;
-    showprogress::Bool = false, progressdesc = "Progress in pmapreduce : ",
-    kwargs...)
-
-    pmapreduce_commutative(
-        plist->freduce((fmap(x...,args...;kwargs...) for x in plist)),
-        freduce,iterable,
-        showprogress = showprogress, progressdesc = progressdesc)
+function pmapreduce_productsplit(f, op, iterators...; reducekw...)
+    N = length(product(iterators...))
+    pool = maybetrimmedworkerpool(workers(), N)
+    pmapreduce_productsplit(f, op, pool, iterators...; reducekw...)
 end
 
 """
-    pmapsum(fmap, iterators, [mapargs...]; 
-        <keyword arguments>, [mapkwargs...])
-
-Evaluate a parallel mapreduce over the range spanned by 
-the outer product of the iterators, where the reduction operation
-is a sum.
-
-The argument `iterators` must be a strictly-increasing range of integers, 
-or a tuple of such ranges.
-The outer product of the ranges is split evenly across the workers. 
-The function `fmap` receives a `ProductSplit` iterator as its first argument
-that acts as a collection of tuples. One may index into a `ProductSplit` 
-or iterate over one to access individual tuples of integers.
-
-Arguments `mapargs` and keyword arguments `mapkwargs` — if provided — are 
-passed on to the mapping function `fmap`.
-
-    pmapsum(fmap, Tmap, iterators, [mapargs...]; 
-        <keyword arguments>, [mapkwargs...])
-
-The types `Tmap` is the return types of the map. 
-The returned values will be coerced to 
-the specified type if possible, throwing an error otherwise.
-
-# Keyword Arguments
+    pmapbatch(f, [pool::AbstractWorkerPool], iterators...)
 
-- `showprogress::Bool = false` : Displays a progress-bar if set to true
-- `progressdesc = "Progress in pmapsum : "` : Leading text in the progress-bar
+Carry out a `pmap` with the `iterators` divided evenly among the available workers.
 
-See also: [`pmapreduce`](@ref), [`pmapreduce_commutative`](@ref)
+See also: [`pmapreduce`](@ref)
 """
-function pmapsum(fmap::Function, T::Type, iterable, args...;kwargs...)
-    pmapreduce_commutative(fmap, T, sum, T, iterable, args...;
-        progressdesc = "Progress in pmapsum : ", kwargs...)
+function pmapbatch(f, pool::AbstractWorkerPool, iterators...)
+    pmapreduce((x...) -> [f(x...)], vcat, pool, iterators...)
 end
 
-function pmapsum(fmap::Function, iterable, args...;kwargs...)
-    pmapreduce_commutative(fmap, sum, iterable, args...;
-        progressdesc = "Progress in pmapsum : ", kwargs...)
+function pmapbatch(f, iterators...)
+    N = length(zip(iterators...))
+    pool = maybetrimmedworkerpool(workers(), N)
+    pmapbatch(f, pool, iterators...)
 end
 
 """
-    pmapsum_elementwise(fmap, iterators, [mapargs...]; 
-        <keyword arguments>, [mapkwargs...])
+    pmapbatch_productsplit(f, [pool::AbstractWorkerPool], iterators...)
 
-Evaluate a parallel mapreduce over the range spanned by 
-the outer product of the iterators, where the reduction operation is a sum. 
-The argument `iterators` must be a strictly-increasing range of integers, 
-or a tuple of such ranges. The map is evaluated elementwise 
-over the entire range of parameters.
+Carry out a `pmap` with the outer product of `iterators` divided evenly among the available workers.
+The function `f` must accept a collection of `Tuple`s.
 
-Arguments `mapargs` and keyword arguments `mapkwargs` — if provided — are 
-passed on to the mapping function `fmap`.
-
-    pmapsum_elementwise(fmap, Tmap, iterators, [mapargs...]; 
-        <keyword arguments>, [mapkwargs...])
-
-The type `Tmap` is the return type of the map. 
-The returned values will be coerced to 
-the specified type if possible, throwing an error otherwise.
-
-# Keyword Arguments
-
-- `showprogress::Bool = false` : Displays a progress-bar if set to true
-- `progressdesc = "Progress in pmapreduce : "` : Leading text in the progress-bar
-
-See also: [`pmapreduce_commutative_elementwise`](@ref), [`pmapsum`](@ref)
+See also: [`pmapbatch`](@ref), [`pmapreduce_productsplit`](@ref)
 """
-function pmapsum_elementwise(fmap::Function, T::Type, iterable,args...;
-    showprogress::Bool = false, progressdesc = "Progress in pmapsum : ",
-    kwargs...)
-
-    pmapsum(plist->sum(x->fmap(x...,args...;kwargs...),plist),T,iterable,
-        showprogress = showprogress, progressdesc = progressdesc)
+function pmapbatch_productsplit(f, pool::AbstractWorkerPool, iterators...)
+    pmapreduce_productsplit(x -> [f(x)], vcat, pool, iterators...)
 end
 
-function pmapsum_elementwise(fmap::Function, iterable, args...;
-    showprogress::Bool = false, progressdesc = "Progress in pmapsum : ",
-    kwargs...)
-
-    pmapsum(plist->sum(x->fmap(x...,args...;kwargs...),plist),iterable,
-        showprogress = showprogress, progressdesc = progressdesc)
+function pmapbatch_productsplit(f, iterators...)
+    N = length(product(iterators...))
+    pool = maybetrimmedworkerpool(workers(), N)
+    pmapbatch_productsplit(f, pool, iterators...)
 end
-
-"""
-    pmapreduce(fmap, freduce, iterators, [mapargs...]; 
-        <keyword arguments>, [mapkwargs...])
-
-Evaluate a parallel mapreduce over the range spanned by 
-the outer product of the iterators.
-
-`iterators` must be a strictly-increasing range of integers, 
-or a tuple of such ranges. 
-The outer product of the ranges is split evenly across the workers. 
-The function `fmap` receives a `ProductSplit` iterator as its first argument
-that acts as a collection of tuples. One may index into a `ProductSplit` 
-or iterate over one to access individual tuples of integers.
-
-The reduction function `freduce` is expected to accept a collection of mapped values.
-Note that this is different from the standard `mapreduce` operation in julia that 
-expects a binary reduction operator. For example, `fmap` should be 
-`sum` and not `+`. In case a binary operator `op` is to be used in the reduction, one may pass it 
-as `Base.splat(op)` or wrap it in an anonymous function as `x -> op(x...)`.
-
-Arguments `mapargs` and keyword arguments `mapkwargs` — if provided — are 
-passed on to the mapping function `fmap`.
-
-    pmapreduce(fmap, Tmap, freduce, Treduce, iterators, [mapargs...]; 
-        <keyword arguments>, [mapkwargs...])
-
-The types `Tmap` and `Treduce` are the return types of the map and 
-reduce operations respectively. The returned values will be coerced to 
-the specified types if possible, throwing an error otherwise. 
-
-# Keyword Arguments
-
-- `showprogress::Bool = false` : Displays a progress-bar if set to true
-- `progressdesc = "Progress in pmapreduce : "` : Leading text in the progress-bar
-
-See also: [`pmapreduce_commutative`](@ref), [`pmapsum`](@ref)
-"""
-function pmapreduce(fmap::Function, Tmap::Type, freduce::Function, Tred::Type,
-    iterators::Tuple, args...;kwargs...)
-
-    tree,branches = createbranchchannels(pval{Tmap},pval{Tred},
-        iterators, SegmentedOrderedBinaryTree)
-    
-    pmapreduceworkers(fmap, freduce, iterators, tree,
-        branches, Sorted(), args...;kwargs...)
-end
-
-function pmapreduce(fmap::Function, freduce::Function, iterators::Tuple, args...;
-    kwargs...)
-
-    pmapreduce(fmap, Any, freduce, Any, iterators, args...;kwargs...)
-end
-
-function pmapreduce(fmap::Function, Tmap::Type, freduce::Function, Tred::Type,
-    iterable, args...;kwargs...)
-    
-    pmapreduce(fmap, Tmap, freduce, Tred, (iterable,), args...;kwargs...)
-end
-
-function pmapreduce(fmap::Function, freduce::Function, iterable, args...;kwargs...)
-    pmapreduce(fmap, freduce, (iterable,), args...;kwargs...)
-end
-
-############################################################################################
-# pmap in batches without reduction
-############################################################################################
-
-"""
-    pmapbatch(f, iterators, [mapargs...]; 
-        [num_workers::Int = nworkersactive(iterators)], [mapkwargs...])
-
-Evaluate the function `f` in parallel, where each worker gets a 
-part of the entire parameter space sequentially. The argument 
-`iterators` needs to be a strictly-increasing range,
-or a tuple of such ranges. The outer product of these ranges forms the 
-entire range of parameters that is processed in batches on 
-the workers. Arguments `mapargs` and keyword arguments `mapkwargs` — if provided — are 
-passed on to the function `f`. 
-
-Additionally, the number of workers to be used may be specified using the 
-keyword argument `num_workers`. In this case the first `num_workers` available
-workers are used in the evaluation.
-
-    pmapbatch(f, T::Type, iterators, [mapargs...];
-        [num_workers::Int = nworkersactive(iterators)], [mapkwargs...])
-
-Evaluate `f` in parallel, and convert the returned value to type `T`. 
-The method is type stable if `T` is concrete.
-Values returned by `f` will be type-coerced if possible, and an error will be raised otherwise.
-
-See also: [`pmapreduce`](@ref), [`pmapsum`](@ref)
-"""
-function pmapbatch(f::Function, iterators::Tuple, args...;
-    num_workers = nworkersactive(iterators),kwargs...)
-
-    pmapbatch(f, Any, iterators, args...; num_workers = num_workers, kwargs...)
-end
-
-function pmapbatch(f::Function, ::Type{T}, iterators::Tuple, args...;
-    num_workers = nworkersactive(iterators), kwargs...) where {T}
-
-    procs_used = workersactive(iterators)
-    if num_workers < length(procs_used)
-        procs_used = procs_used[1:num_workers]
-    end
-    num_workers = length(procs_used)
-
-    res = Vector{T}(undef, num_workers)
-
-    @sync for (rank,p) in enumerate(procs_used)
-        @async begin
-            iterable_on_proc = ProductSplit(iterators, num_workers, rank)
-            res[rank] = @fetchfrom p f(iterable_on_proc, args...;kwargs...)
-        end
-    end
-    
-    vcat(res...)
-end
-
-function pmapbatch(f::Function, T::Type, iterable, args...;kwargs...)
-    pmapbatch(f, T, (iterable,), args...;kwargs...)
-end
-
-function pmapbatch(f::Function, iterable, args...;kwargs...)
-    pmapbatch(f, (iterable,), args...;kwargs...)
-end
-
-"""
-    pmapbatch_elementwise(f, iterators, [mapargs...]; 
-        [num_workers::Int = nworkersactive(iterators)], [mapkwargs...])
-
-Evaluate the function `f` in parallel, where each worker gets a 
-part of the entire parameter space sequentially. The argument 
-`iterators` needs to be a strictly-increasing range of intergers,
-or a tuple of such ranges. The outer product of these ranges forms the 
-entire range of parameters that is processed elementwise by the function `f`.
-The individual tuples are splatted and passed as arguments to `f`.
-Given `n` ranges in `iterators`, the function `f` will receive `n` values 
-at a time.
-
-Arguments `mapargs` and keyword arguments `mapkwargs` — if provided — are 
-passed on to the function `f`. 
-Additionally, the number of workers to be used may be specified using the 
-keyword argument `num_workers`. In this case the first `num_workers` available
-workers are used in the evaluation.
-
-See also: [`pmapbatch`](@ref)
-"""
-function pmapbatch_elementwise(f::Function, iterators, args...;
-    num_workers = nworkersactive(iterators), kwargs...)
-
-    pmapbatch(plist->asyncmap(x->f(x...,args...;kwargs...),plist),
-        iterators,num_workers=num_workers)
-end
\ No newline at end of file
diff --git a/src/productsplit.jl b/src/productsplit.jl
index a59abc8..8bdbef4 100644
--- a/src/productsplit.jl
+++ b/src/productsplit.jl
@@ -1,50 +1,67 @@
+struct TaskNotPresentError{T,U} <: Exception
+    t :: T
+    task :: U
+end
+function Base.showerror(io::IO, err::TaskNotPresentError)
+    print(io, "could not find the task $(err.task) in the list $(err.t)")
+end
+
 """
-    AbstractConstrainedProduct{T,N}
+    AbstractConstrainedProduct{T, N, Q}
 
 Supertype of [`ProductSplit`](@ref) and [`ProductSection`](@ref).
 """
-abstract type AbstractConstrainedProduct{T,N} end
+abstract type AbstractConstrainedProduct{T, N, Q} end
 Base.eltype(::AbstractConstrainedProduct{T}) where {T} = T
-Base.ndims(::AbstractConstrainedProduct{<:Any,N}) where {N} = N
+
+_niterators(::AbstractConstrainedProduct{<:Any, N}) where {N} = N
+
+const IncreasingAbstractConstrainedProduct{T, N} =
+    AbstractConstrainedProduct{T, N, <:NTuple{N, AbstractUnitRange}}
 
 """
-    ProductSection{T,N,Q}
+    ProductSection{T, N, Q<:NTuple{N,AbstractRange}}
 
-Iterator that loops over a specified section of the 
-outer product of the ranges provided in 
-reverse-lexicographic order. The ranges need to be strictly
-increasing. Given `N` ranges, 
-each element returned by the iterator will be 
+Iterator that loops over a specified section of the
+outer product of ranges. If the ranges are strictly increasing, the
+iteration will be in reverse-lexicographic order.
+Given `N` ranges, each element returned by the iterator will be
 a tuple of length `N` with one element from each range.
 
 See also: [`ProductSplit`](@ref)
 """
-struct ProductSection{T,N,Q} <: AbstractConstrainedProduct{T,N}
+struct ProductSection{T, N, Q <: NTuple{N,AbstractRange}} <: AbstractConstrainedProduct{T, N, Q}
     iterators :: Q
-    togglelevels :: NTuple{N,Int}
+    togglelevels :: NTuple{N, Int}
     firstind :: Int
     lastind :: Int
 
-    function ProductSection(iterators::Tuple{Vararg{AbstractRange,N}}, togglelevels::NTuple{N,Int},
+    function ProductSection(iterators::Tuple{Vararg{AbstractRange, N}}, togglelevels::NTuple{N, Int},
         firstind::Int, lastind::Int) where {N}
 
         # Ensure that all the iterators are strictly increasing
-        all(x->step(x)>0, iterators) || 
-        throw(ArgumentError("all the iterators need to be strictly increasing"))
+        all(x->step(x)>0, iterators) ||
+        throw(ArgumentError("all the ranges need to be strictly increasing"))
 
-        T = Tuple{eltype.(iterators)...}
+        T = Tuple{map(eltype, iterators)...}
 
-        new{T,N,typeof(iterators)}(iterators, togglelevels, firstind, lastind)
+        new{T, N, typeof(iterators)}(iterators, togglelevels, firstind, lastind)
     end
 end
 
 function _cumprod(len::Tuple)
-    (0,_cumprod(first(len),Base.tail(len))...)
+    (0, _cumprod(first(len), Base.tail(len))...)
 end
 
 _cumprod(::Integer,::Tuple{}) = ()
 function _cumprod(n::Integer, tl::Tuple)
-    (n,_cumprod(n*first(tl),Base.tail(tl))...)
+    (n, _cumprod(n*first(tl), Base.tail(tl))...)
+end
+
+function takedrop(ps::ProductSection)
+    drop = ps.firstind - 1
+    take = ps.lastind - ps.firstind + 1
+    Iterators.take(Iterators.drop(Iterators.product(ps.iterators...), drop), take)
 end
 
 """
@@ -56,10 +73,10 @@ specified by `inds`.
 
 # Examples
 ```jldoctest
-julia> p = ParallelUtilities.ProductSection((1:3,4:6), 5:8);
+julia> p = ParallelUtilities.ProductSection((1:3, 4:6), 5:8);
 
 julia> collect(p)
-4-element Array{Tuple{Int64,Int64},1}:
+4-element $(Vector{Tuple{Int, Int}}):
  (2, 5)
  (3, 5)
  (1, 6)
@@ -69,12 +86,10 @@ julia> collect(p) == collect(Iterators.product(1:3, 4:6))[5:8]
 true
 ```
 """
-function ProductSection(iterators::Tuple{AbstractRange,Vararg{AbstractRange}},
-    inds::AbstractUnitRange)
-
+function ProductSection(iterators::Tuple{Vararg{AbstractRange}}, inds::AbstractUnitRange)
     firstind, lastind = first(inds), last(inds)
 
-    len = length.(iterators)
+    len = map(length, iterators)
     Nel = prod(len)
     1 <= firstind || throw(
         ArgumentError("the range of indices must start from a number ≥ 1"))
@@ -83,59 +98,76 @@ function ProductSection(iterators::Tuple{AbstractRange,Vararg{AbstractRange}},
     togglelevels = _cumprod(len)
     ProductSection(iterators, togglelevels, firstind, lastind)
 end
-ProductSection(::Tuple{}, ::AbstractUnitRange) = throw(ArgumentError("Need at least one iterator"))
+ProductSection(::Tuple{}, ::AbstractUnitRange) = throw(ArgumentError("need at least one iterator"))
 
 """
-    ProductSplit{T,N,Q}
+    ProductSplit{T, N, Q<:NTuple{N,AbstractRange}}
 
-Iterator that loops over the outer product of ranges in 
-reverse-lexicographic order. The ranges need to be strictly
-increasing. Given `N` ranges, 
-each element returned by the iterator will be 
+Iterator that loops over a section of the outer product of ranges.
+If the ranges are strictly increasing, the iteration is in reverse-lexicographic order.
+Given `N` ranges, each element returned by the iterator will be
 a tuple of length `N` with one element from each range.
 
 See also: [`ProductSection`](@ref)
 """
-struct ProductSplit{T,N,Q <: ProductSection{T,N}} <: AbstractConstrainedProduct{T,N}
-    ps :: Q
+struct ProductSplit{T, N, Q<:NTuple{N, AbstractRange}} <: AbstractConstrainedProduct{T, N, Q}
+    ps :: ProductSection{T, N, Q}
     np :: Int
     p :: Int
 
-    function ProductSplit(ps::ProductSection, np::Integer, p::Integer)
+    function ProductSplit(ps::ProductSection{T, N, Q}, np::Integer, p::Integer) where {T, N, Q}
         1 <= p <= np || throw(ArgumentError("processor rank out of range"))
-        new{eltype(ps),ndims(ps),typeof(ps)}(ps, np, p)
+        new{T, N, Q}(ps, np, p)
     end
 end
 
+function nelementsdroptake(len, np, p)
+    d, r = divrem(len, np)
+    drop = d*(p - 1) + min(r, p - 1)
+    lastind = d*p + min(r, p)
+    take = lastind - drop
+    drop, take
+end
+
 """
     ProductSplit(iterators::Tuple{Vararg{AbstractRange}}, np::Integer, p::Integer)
 
-Construct a `ProductSplit` iterator that represents the outer product 
-of the iterators split over `np` workers, with this instance reprsenting 
+Construct a `ProductSplit` iterator that represents the outer product
+of the iterators split over `np` workers, with this instance representing
 the values on the `p`-th worker.
 
+!!! note
+    `p` here refers to the rank of the worker, and is unrelated to the worker ID obtained by
+    executing `myid()` on that worker.
+
 # Examples
 ```jldoctest
-julia> ProductSplit((1:2,4:5), 2, 1) |> collect
-2-element Array{Tuple{Int64,Int64},1}:
+julia> ParallelUtilities.ProductSplit((1:2, 4:5), 2, 1) |> collect
+2-element $(Vector{Tuple{Int, Int}}):
  (1, 4)
  (2, 4)
 
-julia> ProductSplit((1:2,4:5), 2, 2) |> collect
-2-element Array{Tuple{Int64,Int64},1}:
+julia> ParallelUtilities.ProductSplit((1:2, 4:5), 2, 2) |> collect
+2-element $(Vector{Tuple{Int, Int}}):
  (1, 5)
  (2, 5)
 ```
 """
-function ProductSplit(iterators::Tuple{AbstractRange,Vararg{AbstractRange}}, np::Integer, p::Integer)
-    d,r = divrem(prod(length, iterators), np)
-    firstind = d*(p-1) + min(r,p-1) + 1
-    lastind = d*p + min(r,p)
+function ProductSplit(iterators::Tuple{Vararg{AbstractRange}}, np::Integer, p::Integer)
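+    # Split the product over `np` workers: with d, r = divrem(total, np),
+    # worker `p` skips drop = d*(p - 1) + min(r, p - 1) elements and its share
+    # ends at global index d*p + min(r, p) (computed in `nelementsdroptake`).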
+    drop, take = nelementsdroptake(prod(length, iterators), np, p)
+    firstind = drop + 1
+    lastind = drop + take
     ProductSplit(ProductSection(iterators, firstind:lastind), np, p)
 end
 ProductSplit(::Tuple{}, ::Integer, ::Integer) = throw(ArgumentError("Need at least one iterator"))
 
+takedrop(ps::ProductSplit) = takedrop(ProductSection(ps))
+
 workerrank(ps::ProductSplit) = ps.p
+Distributed.nworkers(ps::ProductSplit) = ps.np
 
 ProductSection(ps::ProductSection) = ps
 ProductSection(ps::ProductSplit) = ps.ps
@@ -143,52 +175,37 @@ ProductSection(ps::ProductSplit) = ps.ps
 getiterators(ps::AbstractConstrainedProduct) = ProductSection(ps).iterators
 togglelevels(ps::AbstractConstrainedProduct) = ProductSection(ps).togglelevels
 
-function mwerepr(ps::ProductSplit)
-    "ProductSplit(" * repr(getiterators(ps)) * ", " * repr(ps.np) * ", " * repr(ps.p) * ")"
-end
-function mwerepr(ps::ProductSection)
-    "ProductSection(" * repr(getiterators(ps)) * ", " * repr(firstindexglobal(ps):lastindexglobal(ps)) * ")"
-end
 function Base.summary(io::IO, ps::AbstractConstrainedProduct)
-    print(io, length(ps),"-element ", mwerepr(ps))
-    if !isempty(ps)
-        print(io, "\n[", repr(first(ps)) * ", ... , " * repr(last(ps)), "]")
-    end
+    print(io, length(ps), "-element ", string(nameof(typeof(ps))))
 end
 function Base.show(io::IO, ps::AbstractConstrainedProduct)
-    print(io, summary(ps))
+    summary(io, ps)
+    if !isempty(ps)
+        print(io, " [", repr(first(ps)) * ", ... , " * repr(last(ps)), "]")
+    end
 end
 
-@deprecate ntasks(x::Tuple) prod(length, x)
-
-"""
-    ntasks(iterators::Tuple)
-
-The total number of elements in the outer product of the ranges contained in 
-`iterators`, equal to `prod(length, iterators)`
-"""
-ntasks
-ntasks(ps::AbstractConstrainedProduct) = ntasks(getiterators(ps))
-
 Base.isempty(ps::AbstractConstrainedProduct) = (firstindexglobal(ps) > lastindexglobal(ps))
 
 function Base.first(ps::AbstractConstrainedProduct)
-    isempty(ps) ? nothing : @inbounds _first(getiterators(ps), childindex(ps, firstindexglobal(ps))...)
+    isempty(ps) && throw(ArgumentError("collection must be non - empty"))
+    _first(getiterators(ps), childindex(ps, firstindexglobal(ps))...)
 end
 
-Base.@propagate_inbounds function _first(t::Tuple, ind::Integer, rest::Integer...)
-    @boundscheck (1 <= ind <= length(first(t))) || throw(BoundsError(first(t),ind))
-    (@inbounds first(t)[ind], _first(Base.tail(t), rest...)...)
+function _first(t::Tuple, ind::Integer, rest::Integer...)
+    (1 <= ind <= length(first(t))) || throw(BoundsError(first(t), ind))
+    (first(t)[ind], _first(Base.tail(t), rest...)...)
 end
 _first(::Tuple{}) = ()
 
 function Base.last(ps::AbstractConstrainedProduct)
-    isempty(ps) ? nothing : @inbounds _last(getiterators(ps), childindex(ps, lastindexglobal(ps))...)
+    isempty(ps) && throw(ArgumentError("collection must be non - empty"))
+    _last(getiterators(ps), childindex(ps, lastindexglobal(ps))...)
 end
 
-Base.@propagate_inbounds function _last(t::Tuple, ind::Integer, rest::Integer...)
-    @boundscheck (1 <= ind <= length(first(t))) || throw(BoundsError(first(t),ind))
-    (@inbounds first(t)[ind], _last(Base.tail(t), rest...)...)
+function _last(t::Tuple, ind::Integer, rest::Integer...)
+    (1 <= ind <= length(first(t))) || throw(BoundsError(first(t), ind))
+    (first(t)[ind], _last(Base.tail(t), rest...)...)
 end
 _last(::Tuple{}) = ()
 
@@ -204,17 +221,17 @@ lastindexglobal(ps::AbstractConstrainedProduct) = ProductSection(ps).lastind
     childindex(ps::AbstractConstrainedProduct, ind)
 
 Return a tuple containing the indices of the individual `AbstractRange`s
-corresponding to the element that is present at index `ind` in the 
+corresponding to the element that is present at index `ind` in the
 outer product of the ranges.
 
 !!! note
-    The index `ind` corresponds to the outer product of the ranges, and not to `ps`. 
+    The index `ind` corresponds to the outer product of the ranges, and not to `ps`.
 
 # Examples
 ```jldoctest
 julia> iters = (1:5, 2:4, 1:3);
 
-julia> ps = ProductSplit(iters, 7, 1);
+julia> ps = ParallelUtilities.ProductSplit(iters, 7, 1);
 
 julia> ind = 6;
 
@@ -231,13 +248,13 @@ See also: [`childindexshifted`](@ref)
 """
 function childindex(ps::AbstractConstrainedProduct, ind)
     tl = reverse(Base.tail(togglelevels(ps)))
-    reverse(childindex(tl,ind))
+    reverse(childindex(tl, ind))
 end
 
 function childindex(tl::Tuple, ind)
     t = first(tl)
     k = div(ind - 1, t)
-    (k+1, childindex(Base.tail(tl), ind - k*t)...)
+    (k + 1, childindex(Base.tail(tl), ind - k*t)...)
 end
 
 # First iterator gets the final remainder
@@ -246,17 +263,17 @@ childindex(::Tuple{}, ind) = (ind,)
 """
     childindexshifted(ps::AbstractConstrainedProduct, ind)
 
-Return a tuple containing the indices in the individual iterators 
-given an index of a `AbstractConstrainedProduct`.
+Return a tuple containing the indices in the individual iterators
+given an index of `ps`.
 
-If the ranges `(r1, r2, ...)` are used to generate
+If the iterators `(r1, r2, ...)` are used to generate
 `ps`, then return `(i1, i2, ...)` such that `ps[ind] == (r1[i1], r2[i2], ...)`.
 
 # Examples
 ```jldoctest
 julia> iters = (1:5, 2:4, 1:3);
 
-julia> ps = ProductSplit(iters, 7, 3);
+julia> ps = ParallelUtilities.ProductSplit(iters, 7, 3);
 
 julia> psind = 4;
 
@@ -273,67 +290,53 @@ function childindexshifted(ps::AbstractConstrainedProduct, ind)
     childindex(ps, (ind - 1) + firstindexglobal(ps))
 end
 
-Base.@propagate_inbounds function Base.getindex(ps::AbstractConstrainedProduct, ind)
-    @boundscheck 1 <= ind <= length(ps) || throw(BoundsError(ps,ind))
+function Base.getindex(ps::AbstractConstrainedProduct, ind)
+    1 <= ind <= length(ps) || throw(BoundsError(ps, ind))
     _getindex(ps, childindexshifted(ps, ind)...)
 end
-# This needs to be a separate function to deal with the case of a single child iterator, in which case 
+# This needs to be a separate function to deal with the case of a single child iterator, in which case
 # it's not clear if the single index is for the ProductSplit or the child iterator
 
 # This method asserts that the number of indices is correct
-Base.@propagate_inbounds function _getindex(ps::AbstractConstrainedProduct{<:Any,N},
-    inds::Vararg{Integer,N}) where {N}
-    
+function _getindex(ps::AbstractConstrainedProduct{<:Any, N}, inds::Vararg{Integer, N}) where {N}
     _getindex(getiterators(ps), inds...)
 end
 
-Base.@propagate_inbounds function _getindex(t::Tuple, ind::Integer, rest::Integer...)
-    @boundscheck (1 <= ind <= length(first(t))) || throw(BoundsError(first(t),ind))
-    (@inbounds first(t)[ind], _getindex(Base.tail(t), rest...)...)
+function _getindex(t::Tuple, ind::Integer, rest::Integer...)
+    (1 <= ind <= length(first(t))) || throw(BoundsError(first(t), ind))
+    (first(t)[ind], _getindex(Base.tail(t), rest...)...)
 end
 _getindex(::Tuple{}, ::Integer...) = ()
 
-function Base.iterate(ps::AbstractConstrainedProduct{T}, state=(first(ps), 1)) where {T}
-    el,n = state
-
-    if n > length(ps)
-        return nothing
-    elseif n == length(ps)
-        # In this case the next value doesn't matter, so just return something arbitary
-        next_state = (el::T, n+1)
-    else
-        next_state = (ps[n+1]::T, n+1)
-    end
-
-    (el::T, next_state)
+function Base.iterate(ps::AbstractConstrainedProduct, state...)
+    iterate(takedrop(ps), state...)
 end
 
-function _firstlastalongdim(ps::AbstractConstrainedProduct, dim,
+function _firstlastalongdim(ps::AbstractConstrainedProduct, dims,
     firstindchild::Tuple = childindex(ps, firstindexglobal(ps)),
     lastindchild::Tuple = childindex(ps, lastindexglobal(ps)))
 
-    iter = getiterators(ps)[dim]
+    iter = getiterators(ps)[dims]
 
-    fic = firstindchild[dim]
-    lic = lastindchild[dim]
+    fic = firstindchild[dims]
+    lic = lastindchild[dims]
 
     first_iter = iter[fic]
     last_iter = iter[lic]
 
-    (first_iter,last_iter)
+    (first_iter, last_iter)
 end
 
-function _checkrollover(ps::AbstractConstrainedProduct, dim,
+function _checkrollover(ps::AbstractConstrainedProduct, dims,
     firstindchild::Tuple = childindex(ps, firstindexglobal(ps)),
     lastindchild::Tuple = childindex(ps, lastindexglobal(ps)))
 
-    _checkrollover(getiterators(ps), dim, firstindchild, lastindchild)
+    _checkrollover(getiterators(ps), dims, firstindchild, lastindchild)
 end
 
-function _checkrollover(t::Tuple, dim, firstindchild::Tuple, lastindchild::Tuple)
-
-    if dim > 0
-        return _checkrollover(Base.tail(t), dim-1, Base.tail(firstindchild), Base.tail(lastindchild))
+function _checkrollover(t::Tuple, dims, firstindchild::Tuple, lastindchild::Tuple)
+    if dims > 0
+        return _checkrollover(Base.tail(t), dims - 1, Base.tail(firstindchild), Base.tail(lastindchild))
     end
 
     !_checknorollover(reverse(t), reverse(firstindchild), reverse(lastindchild))
@@ -344,27 +347,29 @@ function _checknorollover(t, firstindchild, lastindchild)
     first_iter = iter[first(firstindchild)]
     last_iter = iter[first(lastindchild)]
 
-    (last_iter == first_iter) & 
-        _checknorollover(Base.tail(t),Base.tail(firstindchild),Base.tail(lastindchild))
+    (last_iter == first_iter) &
+        _checknorollover(Base.tail(t), Base.tail(firstindchild), Base.tail(lastindchild))
 end
 _checknorollover(::Tuple{}, ::Tuple{}, ::Tuple{}) = true
 
-function _nrollovers(ps::AbstractConstrainedProduct, dim::Integer)
-    dim == ndims(ps) && return 0
-    nelements(ps; dim = dim + 1) - 1
+function _nrollovers(ps::AbstractConstrainedProduct, dims::Integer)
+    dims == _niterators(ps) && return 0
+    nelements(ps; dims = dims + 1) - 1
 end
 
 """
-    nelements(ps::AbstractConstrainedProduct; dim::Integer)
+    nelements(ps::AbstractConstrainedProduct{T, N, <:NTuple{N,AbstractUnitRange}}; dims::Integer) where {T,N}
+
+Compute the number of unique values in the section of the `dims`-th range contained in `ps`.
 
-Compute the number of unique values in the section of the `dim`-th range contained in `ps`.
+The function is defined currently only for iterator products of `AbstractUnitRange`s.
 
 # Examples
 ```jldoctest
-julia> ps = ProductSplit((1:5, 2:4, 1:3), 7, 3);
+julia> ps = ParallelUtilities.ProductSplit((1:5, 2:4, 1:3), 7, 3);
 
 julia> collect(ps)
-7-element Array{Tuple{Int64,Int64,Int64},1}:
+7-element $(Vector{Tuple{Int, Int, Int}}):
  (5, 4, 1)
  (1, 2, 2)
  (2, 2, 2)
@@ -373,38 +378,34 @@ julia> collect(ps)
  (5, 2, 2)
  (1, 3, 2)
 
-julia> ParallelUtilities.nelements(ps, dim = 1)
+julia> ParallelUtilities.nelements(ps, dims = 1)
 5
 
-julia> ParallelUtilities.nelements(ps, dim = 2)
+julia> ParallelUtilities.nelements(ps, dims = 2)
 3
 
-julia> ParallelUtilities.nelements(ps, dim = 3)
+julia> ParallelUtilities.nelements(ps, dims = 3)
 2
 ```
 """
-function nelements(ps::AbstractConstrainedProduct, dim::Integer)
-    Base.depwarn("nelements(ps, dim) is deprecated, please use nelements(ps, dim = dim)", :nelements)
-    nelements(ps, dim = dim)
-end
-function nelements(ps::AbstractConstrainedProduct; dim::Integer)
-    1 <= dim <= ndims(ps) || throw(ArgumentError("1 ⩽ dim ⩽ N=$(ndims(ps)) not satisfied for dim=$dim"))
+function nelements(ps::IncreasingAbstractConstrainedProduct; dims::Integer)
+    1 <= dims <= _niterators(ps) || throw(ArgumentError("1 ⩽ dims ⩽ N=$(_niterators(ps)) not satisfied for dims=$dims"))
 
-    iter = getiterators(ps)[dim]
+    iter = getiterators(ps)[dims]
 
-    if _nrollovers(ps,dim) == 0
-        st = first(ps)[dim]
-        en = last(ps)[dim]
-        stind = searchsortedfirst(iter,st)
-        enind = searchsortedfirst(iter,en)
+    if _nrollovers(ps, dims) == 0
+        st = first(ps)[dims]
+        en = last(ps)[dims]
+        stind = findfirst(isequal(st), iter)
+        enind = findfirst(isequal(en), iter)
         nel = length(stind:enind)
-    elseif _nrollovers(ps,dim) > 1
+    elseif _nrollovers(ps, dims) > 1
         nel = length(iter)
     else
-        st = first(ps)[dim]
-        en = last(ps)[dim]
-        stind = searchsortedfirst(iter,st)
-        enind = searchsortedfirst(iter,en)
+        st = first(ps)[dims]
+        en = last(ps)[dims]
+        stind = findfirst(isequal(st), iter)
+        enind = findfirst(isequal(en), iter)
         if stind > enind
             # some elements are missed out
             nel = length(stind:length(iter)) + length(1:enind)
@@ -417,57 +418,51 @@ end
 
 
 """
-    maximum(ps::AbstractConstrainedProduct; dim::Integer)
+    maximumelement(ps::AbstractConstrainedProduct; dims::Integer)
 
-Compute the maximum value of the section of the range number `dim` contained in `ps`.
+Compute the maximum value of the section of the range number `dims` contained in `ps`.
 
 # Examples
 ```jldoctest
-julia> ps = ProductSplit((1:2,4:5),2,1);
+julia> ps = ParallelUtilities.ProductSplit((1:2, 4:5), 2, 1);
 
 julia> collect(ps)
-2-element Array{Tuple{Int64,Int64},1}:
+2-element $(Vector{Tuple{Int, Int}}):
  (1, 4)
  (2, 4)
 
-julia> maximum(ps, dim = 1)
+julia> ParallelUtilities.maximumelement(ps, dims = 1)
 2
 
-julia> maximum(ps, dim = 2)
+julia> ParallelUtilities.maximumelement(ps, dims = 2)
 4
 ```
 """
-function Base.maximum(ps::AbstractConstrainedProduct; dim::Integer)
+function maximumelement(ps::IncreasingAbstractConstrainedProduct; dims::Integer)
+    isempty(ps) && throw(ArgumentError("collection must be non - empty"))
 
-    isempty(ps) && return nothing
-    
     firstindchild = childindex(ps, firstindexglobal(ps))
     lastindchild = childindex(ps, lastindexglobal(ps))
 
-    first_iter,last_iter = _firstlastalongdim(ps, dim, firstindchild, lastindchild)
+    _, last_iter = _firstlastalongdim(ps, dims, firstindchild, lastindchild)
 
     v = last_iter
 
     # The last index will not roll over so this can be handled easily
-    if dim == ndims(ps)
+    if dims == _niterators(ps)
         return v
     end
 
-    if _checkrollover(ps, dim, firstindchild, lastindchild)
-        iter = getiterators(ps)[dim]
+    if _checkrollover(ps, dims, firstindchild, lastindchild)
+        iter = getiterators(ps)[dims]
         v = maximum(iter)
     end
 
     return v
 end
 
-function Base.maximum(ps::AbstractConstrainedProduct, dim::Integer)
-    Base.depwarn("maximum(ps::AbstractConstrainedProduct, dim) is deprecated, use maximum(ps, dim = dim) instead", :maximum)
-    maximum(ps, dim = dim)
-end
-
-function Base.maximum(ps::AbstractConstrainedProduct{<:Any,1})
-    isempty(ps) && return nothing
+function maximumelement(ps::IncreasingAbstractConstrainedProduct{<:Any, 1})
+    isempty(ps) && throw(ArgumentError("range must be non - empty"))
     lastindchild = childindex(ps, lastindexglobal(ps))
     lic_dim = lastindchild[1]
     iter = getiterators(ps)[1]
@@ -475,57 +470,51 @@ function Base.maximum(ps::AbstractConstrainedProduct{<:Any,1})
 end
 
 """
-    minimum(ps::AbstractConstrainedProduct; dim::Integer)
+    minimumelement(ps::AbstractConstrainedProduct; dims::Integer)
 
-Compute the minimum value of the section of the range number `dim` contained in `ps`.
+Compute the minimum value of the section of the range number `dims` contained in `ps`.
 
 # Examples
 ```jldoctest
-julia> ps = ProductSplit((1:2, 4:5), 2, 1);
+julia> ps = ParallelUtilities.ProductSplit((1:2, 4:5), 2, 1);
 
 julia> collect(ps)
-2-element Array{Tuple{Int64,Int64},1}:
+2-element $(Vector{Tuple{Int, Int}}):
  (1, 4)
  (2, 4)
 
-julia> minimum(ps, dim = 1)
+julia> ParallelUtilities.minimumelement(ps, dims = 1)
 1
 
-julia> minimum(ps, dim = 2)
+julia> ParallelUtilities.minimumelement(ps, dims = 2)
 4
 ```
 """
-function Base.minimum(ps::AbstractConstrainedProduct; dim::Integer)
-    
-    isempty(ps) && return nothing
+function minimumelement(ps::IncreasingAbstractConstrainedProduct; dims::Integer)
+    isempty(ps) && throw(ArgumentError("collection must be non - empty"))
 
     firstindchild = childindex(ps, firstindexglobal(ps))
     lastindchild = childindex(ps, lastindexglobal(ps))
 
-    first_iter,last_iter = _firstlastalongdim(ps, dim, firstindchild, lastindchild)
+    first_iter, last_iter = _firstlastalongdim(ps, dims, firstindchild, lastindchild)
 
     v = first_iter
 
     # The last index will not roll over so this can be handled easily
-    if dim == ndims(ps)
+    if dims == _niterators(ps)
         return v
     end
 
-    if _checkrollover(ps, dim, firstindchild, lastindchild)
-        iter = getiterators(ps)[dim]
+    if _checkrollover(ps, dims, firstindchild, lastindchild)
+        iter = getiterators(ps)[dims]
         v = minimum(iter)
     end
 
     return v
 end
 
-function Base.minimum(ps::AbstractConstrainedProduct, dim::Integer)
-    Base.depwarn("minimum(ps::AbstractConstrainedProduct, dim) is deprecated, use minimum(ps, dim = dim) instead", :minimum)
-    minimum(ps, dim = dim)
-end
-
-function Base.minimum(ps::AbstractConstrainedProduct{<:Any,1})
-    isempty(ps) && return nothing
+function minimumelement(ps::IncreasingAbstractConstrainedProduct{<:Any, 1})
+    isempty(ps) && throw(ArgumentError("range must be non - empty"))
     firstindchild = childindex(ps, firstindexglobal(ps))
     fic_dim = firstindchild[1]
     iter = getiterators(ps)[1]
@@ -533,119 +522,117 @@ function Base.minimum(ps::AbstractConstrainedProduct{<:Any,1})
 end
 
 """
-    extrema(ps::AbstractConstrainedProduct; dim::Integer)
+    extremaelement(ps::AbstractConstrainedProduct; dims::Integer)
 
-Compute the `extrema` of the section of the range number `dim` contained in `ps`.
+Compute the `extrema` of the section of the range number `dims` contained in `ps`.
 
 # Examples
 ```jldoctest
-julia> ps = ProductSplit((1:2, 4:5), 2, 1);
+julia> ps = ParallelUtilities.ProductSplit((1:2, 4:5), 2, 1);
 
 julia> collect(ps)
-2-element Array{Tuple{Int64,Int64},1}:
+2-element $(Vector{Tuple{Int, Int}}):
  (1, 4)
  (2, 4)
 
-julia> extrema(ps, dim = 1)
+julia> ParallelUtilities.extremaelement(ps, dims = 1)
 (1, 2)
 
-julia> extrema(ps, dim = 2)
+julia> ParallelUtilities.extremaelement(ps, dims = 2)
 (4, 4)
 ```
 """
-function Base.extrema(ps::AbstractConstrainedProduct; dim::Integer)
-    
-    isempty(ps) && return nothing
+function extremaelement(ps::IncreasingAbstractConstrainedProduct; dims::Integer)
+    isempty(ps) && throw(ArgumentError("collection must be non - empty"))
 
     firstindchild = childindex(ps, firstindexglobal(ps))
     lastindchild = childindex(ps, lastindexglobal(ps))
 
-    first_iter,last_iter = _firstlastalongdim(ps, dim, firstindchild, lastindchild)
+    first_iter, last_iter = _firstlastalongdim(ps, dims, firstindchild, lastindchild)
 
-    v = (first_iter,last_iter)
+    v = (first_iter, last_iter)
     # The last index will not roll over so this can be handled easily
-    if dim == ndims(ps)
+    if dims == _niterators(ps)
         return v
     end
 
-    if _checkrollover(ps, dim, firstindchild, lastindchild)
-        iter = getiterators(ps)[dim]
+    if _checkrollover(ps, dims, firstindchild, lastindchild)
+        iter = getiterators(ps)[dims]
         v = extrema(iter)
     end
 
     return v
 end
 
-function Base.extrema(ps::AbstractConstrainedProduct{<:Any,1})
-    isempty(ps) && return nothing
+function extremaelement(ps::IncreasingAbstractConstrainedProduct{<:Any, 1})
+    isempty(ps) && throw(ArgumentError("collection must be non - empty"))
     firstindchild = childindex(ps, firstindexglobal(ps))
     lastindchild = childindex(ps, lastindexglobal(ps))
     fic_dim = firstindchild[1]
     lic_dim = lastindchild[1]
     iter = getiterators(ps)[1]
-    
+
     (iter[fic_dim], iter[lic_dim])
 end
 
-function Base.extrema(ps::AbstractConstrainedProduct, dim::Integer)
-    Base.depwarn("extrema(ps::AbstractConstrainedProduct, dim) is deprecated, use extrema(ps, dim = dim) instead", :extrema)
-    extrema(ps, dim = dim)
+for (f, g) in [(:maximumelement, :maximum), (:minimumelement, :minimum), (:extremaelement, :extrema)]
+    @eval $f(ps::AbstractConstrainedProduct{<:Any, 1}) = $g(first, takedrop(ps))
+    @eval $f(ps::AbstractConstrainedProduct; dims::Integer) = $g(x -> x[dims], takedrop(ps))
 end
 
 """
     extremadims(ps::AbstractConstrainedProduct)
 
-Compute the extrema of the sections of all the ranges contained in `ps`. 
-Functionally this is equivalent to 
+Compute the extrema of the sections of all the ranges contained in `ps`.
+Functionally this is equivalent to
 
 ```julia
-map(i -> extrema(ps, dim = i), 1:ndims(ps))
+map(i -> extremaelement(ps, dims = i), 1:_niterators(ps))
 ```
 
-but it is implemented more efficiently. 
+but it is implemented more efficiently.
 
-Returns a `Tuple` containing the `(min,max)` pairs along each 
+Returns a `Tuple` containing the `(min, max)` pairs along each
 dimension, such that the `i`-th index of the result contains the `extrema` along the section of the `i`-th range
 contained locally.
 
 # Examples
 ```jldoctest
-julia> ps = ProductSplit((1:2, 4:5), 2, 1);
+julia> ps = ParallelUtilities.ProductSplit((1:2, 4:5), 2, 1);
 
 julia> collect(ps)
-2-element Array{Tuple{Int64,Int64},1}:
+2-element $(Vector{Tuple{Int, Int}}):
  (1, 4)
  (2, 4)
 
-julia> extremadims(ps)
+julia> ParallelUtilities.extremadims(ps)
 ((1, 2), (4, 4))
 ```
 """
 function extremadims(ps::AbstractConstrainedProduct)
-    Base.depwarn("extremadims will not be exported in a future release, please call it as ParallelUtilities.extremadims instead", :extremadims)
     _extremadims(ps, 1, getiterators(ps))
 end
 
-function _extremadims(ps::AbstractConstrainedProduct, dim::Integer, iterators::Tuple)
-    (extrema(ps; dim = dim), _extremadims(ps, dim+1, Base.tail(iterators))...)
+function _extremadims(ps::AbstractConstrainedProduct, dims::Integer, iterators::Tuple)
+    (extremaelement(ps; dims = dims), _extremadims(ps, dims + 1, Base.tail(iterators))...)
 end
 _extremadims(::AbstractConstrainedProduct, ::Integer, ::Tuple{}) = ()
 
 """
-    extrema_commonlastdim(ps::AbstractConstrainedProduct)
+    extrema_commonlastdim(ps::AbstractConstrainedProduct{T, N, <:NTuple{N,AbstractUnitRange}}) where {T,N}
 
-Return the reverse-lexicographic extrema of values taken from 
-ranges contained in `ps`, where the pairs of ranges are constructed 
+Return the reverse-lexicographic extrema of values taken from
+ranges contained in `ps`, where the pairs of ranges are constructed
 by concatenating the ranges along each dimension with the last one.
 
 For two ranges this simply returns `([first(ps)], [last(ps)])`.
 
 # Examples
 ```jldoctest
-julia> ps = ProductSplit((1:3,4:7,2:7), 10, 2);
+julia> ps = ParallelUtilities.ProductSplit((1:3, 4:7, 2:7), 10, 2);
 
 julia> collect(ps)
-8-element Array{Tuple{Int64,Int64,Int64},1}:
+8-element $(Vector{Tuple{Int, Int, Int}}):
  (3, 6, 2)
  (1, 7, 2)
  (2, 7, 2)
@@ -655,66 +642,50 @@ julia> collect(ps)
  (3, 4, 3)
  (1, 5, 3)
 
-julia> extrema_commonlastdim(ps)
-([(1, 2), (6, 2)], [(3, 3), (5, 3)])
+julia> ParallelUtilities.extrema_commonlastdim(ps)
+$((Tuple{Int,Int}[(1, 2), (6, 2)], Tuple{Int,Int}[(3, 3), (5, 3)]))
 ```
 """
-function extrema_commonlastdim(ps::AbstractConstrainedProduct{<:Any,N}) where {N}
-
-    Base.depwarn("extrema_commonlastdim will not be exported in a future release, please call it as ParallelUtilities.extrema_commonlastdim instead", :extrema_commonlastdim)
-
+function extrema_commonlastdim(ps::IncreasingAbstractConstrainedProduct)
     isempty(ps) && return nothing
-    
+
     m = extremadims(ps)
-    lastvar_min = last(m)[1]
-    lastvar_max = last(m)[2]
+    lastvar_min, lastvar_max = last(m)
 
     val_first = first(ps)
     val_last = last(ps)
-    min_vals = collect(val_first[1:end-1])
-    max_vals = collect(val_last[1:end-1])
+    min_vals = collect(Base.front(val_first))
+    max_vals = collect(Base.front(val_last))
 
     for val in ps
         val_rev = reverse(val)
         lastvar = first(val_rev)
         (lastvar_min < lastvar < lastvar_max) && continue
 
-        for (ind,vi) in enumerate(Base.tail(val_rev))
-            if lastvar==lastvar_min
-                min_vals[N-ind] = min(min_vals[N-ind],vi)
+        for (ind, vi) in enumerate(Base.tail(val_rev))
+            if lastvar == lastvar_min
+                min_vals[_niterators(ps) - ind] = min(min_vals[_niterators(ps) - ind], vi)
             end
-            if lastvar==lastvar_max
-                max_vals[N-ind] = max(max_vals[N-ind],vi)
+            if lastvar == lastvar_max
+                max_vals[_niterators(ps) - ind] = max(max_vals[_niterators(ps) - ind], vi)
             end
         end
     end
 
-    [(m,lastvar_min) for m in min_vals],[(m,lastvar_max) for m in max_vals]
+    [(m, lastvar_min) for m in min_vals], [(m, lastvar_max) for m in max_vals]
 end
 
-_infullrange(val::T, ps::AbstractConstrainedProduct{T}) where {T} = _infullrange(val,getiterators(ps))
+_infullrange(val::T, ps::AbstractConstrainedProduct{T}) where {T} = _infullrange(val, getiterators(ps))
 
 function _infullrange(val, t::Tuple)
-    first(val) in first(t) && _infullrange(Base.tail(val),Base.tail(t))
+    first(val) in first(t) && _infullrange(Base.tail(val), Base.tail(t))
 end
 _infullrange(::Tuple{}, ::Tuple{}) = true
 
-function c2l_rec(iprev, nprev, ax, inds)
-    i = searchsortedfirst(ax[1],inds[1])
-    inew = iprev + (i-1)*nprev
-    n = nprev*length(ax[1])
-    c2l_rec(inew, n, Base.tail(ax), Base.tail(inds))
-end
-
-c2l_rec(i, n, ::Tuple{}, ::Tuple{}) = i
-
-_cartesiantolinear(ax, inds) = c2l_rec(1,1,ax,inds)
-
 """
-    indexinproduct(iterators::NTuple{N,AbstractRange}, val::NTuple{N,Any}) where {N}
+    indexinproduct(iterators::NTuple{N, AbstractRange}, val::NTuple{N, Any}) where {N}
 
-Return the index of `val` in the outer product of `iterators`, 
-where `iterators` is a `Tuple` of increasing `AbstractRange`s. 
+Return the index of `val` in the outer product of `iterators`.
 Return nothing if `val` is not present.
 
 # Examples
@@ -723,33 +694,36 @@ julia> iterators = (1:4, 1:3, 3:5);
 
 julia> val = (2, 2, 4);
 
-julia> ind = ParallelUtilities.indexinproduct(iterators,val)
+julia> ind = ParallelUtilities.indexinproduct(iterators, val)
 18
 
 julia> collect(Iterators.product(iterators...))[ind] == val
 true
 ```
 """
-function indexinproduct(iterators::Tuple{Vararg{AbstractRange,N}},
-    val::Tuple{Vararg{Any,N}}) where {N}
-
-    all(in.(val,iterators)) || return nothing
+function indexinproduct(iterators::NTuple{N, AbstractRange}, val::Tuple{Vararg{Any, N}}) where {N}
+    all(map(in, val, iterators)) || return nothing
 
-    ax = axes.(iterators,1)
-    individual_inds = searchsortedfirst.(iterators,val)
+    ax = map(x -> 1:length(x), iterators)
+    individual_inds = map((it, val) -> findfirst(isequal(val), it), iterators, val)
 
-    _cartesiantolinear(ax, individual_inds)
+    LinearIndices(ax)[individual_inds...]
 end
 
-indexinproduct(::Tuple{}, ::Tuple) = throw(ArgumentError("need at least one iterator"))
+indexinproduct(::Tuple{}, ::Tuple{}) = throw(ArgumentError("need at least one iterator"))
 
 function Base.in(val::T, ps::AbstractConstrainedProduct{T}) where {T}
-    _infullrange(val,ps) || return false
-    
+    _infullrange(val, ps) || return false
+
     ind = indexinproduct(getiterators(ps), val)
     firstindexglobal(ps) <= ind <= lastindexglobal(ps)
 end
 
+function Base.in(val::T, ps::IncreasingAbstractConstrainedProduct{T}) where {T}
+    _infullrange(val, ps) || return false
+    ReverseLexicographicTuple(first(ps)) <= ReverseLexicographicTuple(val) <= ReverseLexicographicTuple(last(ps))
+end
+
 # This struct is just a wrapper to flip the tuples before comparing
 struct ReverseLexicographicTuple{T<:Tuple}
     t :: T
@@ -759,10 +733,10 @@ Base.isless(a::ReverseLexicographicTuple{T}, b::ReverseLexicographicTuple{T}) wh
 Base.isequal(a::ReverseLexicographicTuple, b::ReverseLexicographicTuple) = a.t == b.t
 
 """
-    whichproc(iterators::Tuple, val::Tuple, np::Integer)
+    whichproc(iterators::Tuple{Vararg{AbstractRange}}, val::Tuple, np::Integer)
 
-Return the processor rank that will contain `val` if the outer 
-product of the ranges contained in `iterators` is split evenly 
+Return the processor rank that will contain `val` if the outer
+product of the ranges contained in `iterators` is split evenly
 across `np` processors.
 
 # Examples
@@ -771,35 +745,34 @@ julia> iters = (1:4, 2:3);
 
 julia> np = 2;
 
-julia> ProductSplit(iters, np, 2) |> collect
-4-element Array{Tuple{Int64,Int64},1}:
+julia> ParallelUtilities.ProductSplit(iters, np, 2) |> collect
+4-element $(Vector{Tuple{Int, Int}}):
  (1, 3)
  (2, 3)
  (3, 3)
  (4, 3)
 
-julia> whichproc(iters, (2,3), np)
+julia> ParallelUtilities.whichproc(iters, (2, 3), np)
 2
-``` 
+```
 """
-function whichproc(iterators, val, np::Integer)
-    
-    _infullrange(val,iterators) || return nothing
+function whichproc(iterators::Tuple{AbstractRange, Vararg{AbstractRange}}, val, np::Integer)
+    _infullrange(val, iterators) || return nothing
     np >= 1 || throw(ArgumentError("np must be >= 1"))
     np  == 1 && return 1
 
     # We may carry out a binary search as the iterators are sorted
-    left,right = 1,np
+    left, right = 1, np
 
     val_t = ReverseLexicographicTuple(val)
 
     while left < right
-        mid = div(left+right, 2)
+        mid = div(left + right, 2)
         ps = ProductSplit(iterators, np, mid)
 
         # If np is greater than the number of ntasks then it's possible
         # that ps is empty. In this case the value must be somewhere in
-        # the previous workers. Otherwise each worker has some tasks and 
+        # the previous workers. Otherwise each worker has some tasks and
         # these are sorted, so carry out a binary search
 
         if isempty(ps) || val_t < ReverseLexicographicTuple(first(ps))
@@ -814,71 +787,64 @@ function whichproc(iterators, val, np::Integer)
     return left
 end
 
-whichproc(iterators, ::Nothing, np::Integer) = nothing
-
 whichproc(ps::ProductSplit, val) = whichproc(getiterators(ps), val, ps.np)
 
 # This function tells us the range of processors that would be involved
 # if we are to compute the tasks contained in the list ps on np_new processors.
-# The total list of tasks is contained in iterators, and might differ from 
+# The total list of tasks is contained in iterators, and might differ from
 # getiterators(ps) (eg if ps contains a subsection of the parameter set)
 """
-    procrange_recast(iterators::Tuple, ps::ProductSplit, np_new::Integer)
+    procrange_recast(iterators::Tuple{Vararg{AbstractRange}}, ps, np_new::Integer)
 
-Return the range of processor ranks that would contain the values in `ps` if 
-the outer produce of the ranges in `iterators` is split across `np_new` 
+Return the range of processor ranks that would contain the values in `ps` if
+the outer product of the ranges in `iterators` is split across `np_new`
 workers.
 
-The values contained in `ps` should be a subsection of the outer product of 
+The values contained in `ps` should be a subsection of the outer product of
 the ranges in `iterators`.
 
 # Examples
 ```jldoctest
 julia> iters = (1:10, 4:6, 1:4);
 
-julia> ps = ProductSplit(iters, 5, 2);
+julia> ps = ParallelUtilities.ProductSplit(iters, 5, 2);
 
-julia> procrange_recast(iters, ps, 10)
+julia> ParallelUtilities.procrange_recast(iters, ps, 10)
 3:4
 ```
 """
-function procrange_recast(iterators::Tuple, ps::AbstractConstrainedProduct, np_new::Integer)
-    
-    Base.depwarn("procrange_recast will not be exported in a future release, please call it as ParallelUtilities.procrange_recast instead", :procrange_recast)
-
-    if isempty(ps)
-        return 0:-1 # empty range
-    end
+function procrange_recast(iterators::Tuple{AbstractRange, Vararg{AbstractRange}}, ps::AbstractConstrainedProduct, np_new::Integer)
+    isempty(ps) && return nothing
 
-    procid_start = whichproc(iterators,first(ps),np_new)
+    procid_start = whichproc(iterators, first(ps), np_new)
     if procid_start === nothing
-        throw(TaskNotPresentError(iterators,first(ps)))
+        throw(TaskNotPresentError(iterators, first(ps)))
     end
     if length(ps) == 1
         procid_end = procid_start
     else
-        procid_end = whichproc(iterators,last(ps),np_new)
+        procid_end = whichproc(iterators, last(ps), np_new)
         if procid_end === nothing
-            throw(TaskNotPresentError(iterators,last(ps)))
+            throw(TaskNotPresentError(iterators, last(ps)))
         end
     end
-    
+
     return procid_start:procid_end
 end
 
 """
     procrange_recast(ps::AbstractConstrainedProduct, np_new::Integer)
 
-Return the range of processor ranks that would contain the values in `ps` if the 
+Return the range of processor ranks that would contain the values in `ps` if the
 iterators used to construct `ps` were split across `np_new` processes.
 
 # Examples
 ```jldoctest
 julia> iters = (1:10, 4:6, 1:4);
 
-julia> ps = ProductSplit(iters, 5, 2); # split across 5 processes initially
+julia> ps = ParallelUtilities.ProductSplit(iters, 5, 2); # split across 5 processes initially
 
-julia> procrange_recast(ps, 10) # If `iters` were spread across 10 processes
+julia> ParallelUtilities.procrange_recast(ps, 10) # If `iters` were spread across 10 processes
 3:4
 ```
 """
@@ -894,40 +860,31 @@ is not found.
 
 # Examples
 ```jldoctest
-julia> ps = ProductSplit((1:3, 4:5:20), 3, 2);
+julia> ps = ParallelUtilities.ProductSplit((1:3, 4:5:20), 3, 2);
 
 julia> collect(ps)
-4-element Array{Tuple{Int64,Int64},1}:
+4-element $(Vector{Tuple{Int, Int}}):
  (2, 9)
  (3, 9)
  (1, 14)
  (2, 14)
 
-julia> localindex(ps, (3,9))
+julia> ParallelUtilities.localindex(ps, (3, 9))
 2
 ```
 """
 function localindex(ps::AbstractConstrainedProduct{T}, val::T) where {T}
-
     (isempty(ps) || val ∉ ps) && return nothing
 
     indflat = indexinproduct(getiterators(ps), val)
     indflat - firstindexglobal(ps) + 1
 end
 
-# this is only needed because first and last return nothing if the ProductSplit is empty
-localindex(::AbstractConstrainedProduct, ::Nothing) = nothing
-
-function localindex(iterators::Tuple, val::Tuple, np::Integer, p::Integer)
-    ps = ProductSplit(iterators, np, p)
-    localindex(ps, val)
-end
-
 """
-    whichproc_localindex(iterators::Tuple, val::Tuple, np::Integer)
+    whichproc_localindex(iterators::Tuple{Vararg{AbstractRange}}, val::Tuple, np::Integer)
 
-Return `(rank,ind)`, where `rank` is the
-rank of the worker that `val` will reside on if the outer product 
+Return `(rank, ind)`, where `rank` is the
+rank of the worker that `val` will reside on if the outer product
 of the ranges in `iterators` is spread over `np` workers, and `ind` is
 the index of `val` in the local section on that worker.
 
@@ -937,37 +894,37 @@ julia> iters = (1:4, 2:8);
 
 julia> np = 10;
 
-julia> whichproc_localindex(iters, (2,4), np)
+julia> ParallelUtilities.whichproc_localindex(iters, (2, 4), np)
 (4, 1)
 
-julia> ProductSplit(iters, np, 4) |> collect
-3-element Array{Tuple{Int64,Int64},1}:
+julia> ParallelUtilities.ProductSplit(iters, np, 4) |> collect
+3-element $(Vector{Tuple{Int, Int}}):
  (2, 4)
  (3, 4)
  (4, 4)
 ```
 """
-function whichproc_localindex(iterators::Tuple, val::Tuple, np::Integer)
+function whichproc_localindex(iterators::Tuple{Vararg{AbstractRange}}, val::Tuple, np::Integer)
     procid = whichproc(iterators, val, np)
-    index = localindex(iterators, val, np, procid)
+    index = localindex(ProductSplit(iterators, np, procid), val)
     return procid, index
 end
 
 #################################################################
 
 """
-    dropleading(ps::AbstractConstrainedProduct)
+    dropleading(ps::AbstractConstrainedProduct{T, N, NTuple{N,AbstractUnitRange}}) where {T,N}
 
-Return a `ProductSection` leaving out the first iterator contained in `ps`. 
-The range of values of the remaining iterators in the 
+Return a `ProductSection` leaving out the first iterator contained in `ps`.
+The range of values of the remaining iterators in the
 resulting `ProductSection` will be the same as in `ps`.
 
 # Examples
 ```jldoctest
-julia> ps = ProductSplit((1:5, 2:4, 1:3), 7, 3);
+julia> ps = ParallelUtilities.ProductSplit((1:5, 2:4, 1:3), 7, 3);
 
 julia> collect(ps)
-7-element Array{Tuple{Int64,Int64,Int64},1}:
+7-element $(Vector{Tuple{Int, Int, Int}}):
  (5, 4, 1)
  (1, 2, 2)
  (2, 2, 2)
@@ -977,18 +934,18 @@ julia> collect(ps)
  (1, 3, 2)
 
 julia> ParallelUtilities.dropleading(ps) |> collect
-3-element Array{Tuple{Int64,Int64},1}:
+3-element $(Vector{Tuple{Int, Int}}):
  (4, 1)
  (2, 2)
  (3, 2)
 ```
 """
-function dropleading(ps::AbstractConstrainedProduct)
+function dropleading(ps::IncreasingAbstractConstrainedProduct)
     isempty(ps) && throw(ArgumentError("need at least one iterator"))
     iterators = Base.tail(getiterators(ps))
     first_element = Base.tail(first(ps))
     last_element = Base.tail(last(ps))
     firstind = indexinproduct(iterators, first_element)
     lastind = indexinproduct(iterators, last_element)
-    ProductSection(iterators,firstind:lastind)
-end
\ No newline at end of file
+    ProductSection(iterators, firstind:lastind)
+end
diff --git a/src/reductionfunctions.jl b/src/reductionfunctions.jl
index c98a94a..0ce8e37 100644
--- a/src/reductionfunctions.jl
+++ b/src/reductionfunctions.jl
@@ -1,200 +1,269 @@
-throw_dimserror(dims::Integer, N) = throw(ArgumentError("dims = $dims does not satisfy 1 <= dims <= $N"))
-throw_dimserror(dims, N) = throw(ArgumentError("dims = $dims does not satisfy 1 <= dims <= $N for all elements"))
-
-throw_axesmismatcherror(dim, axexp, axrcv) = throw(
-	DimensionMismatch("axes mismatch in dimension $dim, expected $axexp but received $axrcv"))
-
-function _checkdims(A, dim, ax_exp)
-	for a in A
-		axadim = axes(a, dim)
-		if axadim != ax_exp
-			throw_axesmismatcherror(dim, ax_exp, axadim)
-		end
-	end
-end
+"""
+    Commutative
 
-function checkdims(A, dims)
-	for (dim, ax_exp) in enumerate(axes(first(A)))
-		if dim ∉ dims
-			_checkdims(A, dim, ax_exp)
-		end
-	end
+Declare a reduction operator to be commutative in its arguments.
+No check is performed to ascertain if the operator is indeed commutative.
+"""
+struct Commutative{F} <: Function
+    f :: F
 end
 
-function checkdims(A, d::Integer)
-	for (dim, ax_exp) in enumerate(axes(first(A)))
-		dim == d && continue
-		_checkdims(A, dim, ax_exp)
-	end
-end
+(c::Commutative)(x, y) = c.f(x, y)
 
 """
-	sumcat_aligned(A::AbstractArray{T,N}...; dims) where {T,N}
+    BroadcastFunction(f)
 
-Concatenate the arrays along the dimensions `dims` according to their axes, 
-with overlapping sections being summed over. Returns an `OffsetArray` with the minimal 
-axis span encompassing all the arrays.
+Construct a binary function that evaluates `f.(x, y)` given the arguments `x` and `y`.
 
-`dims` may be an `Integer` or a collection of `Integer`s, but all elements of `dims` must be from the range `1:N`.
+!!! note
+    The function `BroadcastFunction(f)` is equivalent to `Base.BroadcastFunction(f)` on Julia versions
+    1.6 and above.
 
 # Examples
 ```jldoctest
-julia> ParallelUtilities.sumcat_aligned(ones(1:2), ones(4:5), dims=1)
-5-element OffsetArray(::Array{Float64,1}, 1:5) with eltype Float64 with indices 1:5:
- 1.0
- 1.0
- 0.0
- 1.0
- 1.0
-
-julia> ParallelUtilities.sumcat_aligned(ones(1:2, 1:2), ones(2:3, 2:3), dims=(1,2))
-3×3 OffsetArray(::Array{Float64,2}, 1:3, 1:3) with eltype Float64 with indices 1:3×1:3:
- 1.0  1.0  0.0
- 1.0  2.0  1.0
- 0.0  1.0  1.0
-
-julia> ParallelUtilities.sumcat_aligned(ones(1:2, 1:2), ones(3:4, 3:4), dims=(1,2))
-4×4 OffsetArray(::Array{Float64,2}, 1:4, 1:4) with eltype Float64 with indices 1:4×1:4:
- 1.0  1.0  0.0  0.0
- 1.0  1.0  0.0  0.0
- 0.0  0.0  1.0  1.0
- 0.0  0.0  1.0  1.0
+julia> ParallelUtilities.BroadcastFunction(+)(ones(3), ones(3))
+3-element $(Vector{Float64}):
+ 2.0
+ 2.0
+ 2.0
 ```
+"""
+struct BroadcastFunction{V, F} <: Function
+    f :: F
+end
+
+BroadcastFunction{V}(f) where {V} = BroadcastFunction{V, typeof(f)}(f)
+BroadcastFunction(f::Function) = BroadcastFunction{Nothing, typeof(f)}(f)
+
+(o::BroadcastFunction{Nothing})(x, y) = o.f.(x, y)
+
+(o::BroadcastFunction{1})(x, y) = broadcast!(o.f, x, x, y)
+(o::BroadcastFunction{2})(x, y) = broadcast!(o.f, y, x, y)
 
-See also: [`sumhcat_aligned`](@ref), [`sumvcat_aligned`](@ref)
 """
-function sumcat_aligned(A::AbstractArray{T,N}...; dims) where {T,N}
+    broadcastinplace(f, ::Val{N}) where {N}
+
+Construct a binary operator that evaluates `f.(x, y)` and overwrites the `N`th argument with the result.
+For `N == 1` this evaluates `x .= f.(x, y)`, whereas for `N == 2` this evaluates `y .= f.(x, y)`.
 
-	all(x -> 1 <= x <= N, dims) || throw_dimserror(dims, N)
+# Examples
 
-	checkdims(A, dims)
+```jldoctest
+julia> op = ParallelUtilities.broadcastinplace(+, Val(1));
 
-	ax = Vector{UnitRange{Int}}(undef, N)
-	ax .= axes(first(A))
+julia> x = ones(3); y = ones(3);
 
-	for d in dims
-		axmin = minimum(minimum.(axes.(A, d)))
-		axmax = maximum(maximum.(axes.(A, d)))
-		ax[d] = axmin:axmax
-	end
-	
-	arr = OffsetArray{T,N}(undef, ax...)
-	fill!(arr, zero(T))
+julia> op(x, y)
+3-element $(Vector{Float64}):
+ 2.0
+ 2.0
+ 2.0
 
-	for a in A
-		arr[axes(a)...] .+= a
-	end
-	arr
+julia> x # overwritten
+3-element $(Vector{Float64}):
+ 2.0
+ 2.0
+ 2.0
+```
+"""
+function broadcastinplace(f, v::Val{N}) where {N}
+    BroadcastFunction{N}(f)
 end
 
-sumcat_aligned(A1::AbstractArray; dims) = (all(x -> 1 <= x <= ndims(A1), dims) || throw_dimserror(dims); A1)
+"""
+    elementwisesum!(x, y)
 
+Binary reduction operator that performs an elementwise sum and stores the result inplace in `x`.
+The value of `x` is overwritten in the process.
+
+Functionally `elementwisesum!(x, y)` is equivalent to `x .= x .+ y`.
+
+!!! note
+    The operator is assumed to be commutative.
 """
-	sumvcat_aligned(A::AbstractArray{T,N}...) where {T,N}
+const elementwisesum! = Commutative(broadcastinplace(+, Val(1)))
 
-Concatenate the arrays along the first dimension according to their axes, 
-with overlapping sections being summed over. Returns an `OffsetArray` with the minimal 
-axis span encompassing all the arrays.
+"""
+    elementwiseproduct!(x, y)
 
-The input arrays must be at least one-dimensional.
+Binary reduction operator that performs an elementwise product and stores the result inplace in `x`.
+The value of `x` is overwritten in the process.
 
-# Examples
-```jldoctest
-julia> ParallelUtilities.sumvcat_aligned(ones(1:2), ones(4:5))
-5-element OffsetArray(::Array{Float64,1}, 1:5) with eltype Float64 with indices 1:5:
- 1.0
- 1.0
- 0.0
- 1.0
- 1.0
-
-julia> ParallelUtilities.sumvcat_aligned(ones(1:2, 1:2), ones(2:3, 1:2))
-3×2 OffsetArray(::Array{Float64,2}, 1:3, 1:2) with eltype Float64 with indices 1:3×1:2:
- 1.0  1.0
- 2.0  2.0
- 1.0  1.0
-```
+Functionally `elementwiseproduct!(x, y)` is equivalent to `x .= x .* y`.
 
-See also: [`sumcat_aligned`](@ref), [`sumhcat_aligned`](@ref)
+!!! note
+    The operator is assumed to be commutative.
 """
-function sumvcat_aligned(A::AbstractArray{T,N}...) where {T,N}
+const elementwiseproduct! = Commutative(broadcastinplace(*, Val(1)))
 
-	N >= 1 || throw(ArgumentError("all the arrays need to have at least 1 dimension"))
-	checkdims(A, 1)
+"""
+    elementwisemin!(x, y)
 
-	axmin = minimum(minimum.(axes.(A, 1)))
-	axmax = maximum(maximum.(axes.(A, 1)))
-	
-	axcat = axmin:axmax
+Binary reduction operator that performs an elementwise `min` and stores the result inplace in `x`.
+The value of `x` is overwritten in the process.
 
-	trailing_axes = Base.tail(axes(first(A)))
-	
-	arr = OffsetArray{T,N}(undef, axcat, trailing_axes...)
-	fill!(arr, zero(T))
+Functionally `elementwisemin!(x, y)` is equivalent to `x .= min.(x, y)`.
 
-	for axt in CartesianIndices(trailing_axes)
-		for a in A, ind1 in axes(a,1)
-			arr[ind1, axt] += a[ind1, axt]
-		end
-	end
+!!! note
+    The operator is assumed to be commutative.
+"""
+const elementwisemin! = Commutative(broadcastinplace(min, Val(1)))
 
-	arr
-end
+"""
+    elementwisemax!(x, y)
 
-function sumvcat_aligned(A::AbstractArray)
-	ndims(A) >= 1 || throw(ArgumentError("the array needs to have at least 1 dimension"))
-	A
-end
+Binary reduction operator that performs an elementwise `max` and stores the result inplace in `x`.
+The value of `x` is overwritten in the process.
+
+Functionally `elementwisemax!(x, y)` is equivalent to `x .= max.(x, y)`.
+
+!!! note
+    The operator is assumed to be commutative.
+"""
+const elementwisemax! = Commutative(broadcastinplace(max, Val(1)))
 
 """
-	sumhcat_aligned(A::AbstractArray{T,N}...) where {T,N}
+    BroadcastStack(f, dims)(x::AbstractArray, y::AbstractArray)
 
-Concatenate the arrays along the second dimension according to their axes, 
-with overlapping sections being summed over. Returns an `OffsetArray` with the minimal 
-axis span encompassing all the arrays. 
+Construct a binary function that stacks its arguments along `dims`, with overlapping indices `I` being replaced by
+`f(x[I], y[I])`. The arguments `x` and `y` must both be `n`-dimensional arrays that have identical axes along all dimensions
+aside from those specified by `dims`. The axes of the result along each dimension `d`
+in `dims` would be `union(axes(x, d), axes(y, d))`.
+Along the other dimensions the result has the same axes as `x` and `y`.
 
-The input arrays must be at least two-dimensional.
+!!! note
+    If the resulting axes along the concatenated dimensions are not 1-based, one might require an offset array package
+    such as [`OffsetArrays.jl`](https://github.com/JuliaArrays/OffsetArrays.jl).
 
 # Examples
 ```jldoctest
-julia> ParallelUtilities.sumhcat_aligned(ones(2, 1:2), ones(2, 4:5))
-2×5 OffsetArray(::Array{Float64,2}, 1:2, 1:5) with eltype Float64 with indices 1:2×1:5:
- 1.0  1.0  0.0  1.0  1.0
- 1.0  1.0  0.0  1.0  1.0
-
-julia> ParallelUtilities.sumhcat_aligned(ones(1:2, 1:2), ones(1:2, 2:3))
-2×3 OffsetArray(::Array{Float64,2}, 1:2, 1:3) with eltype Float64 with indices 1:2×1:3:
- 1.0  2.0  1.0
- 1.0  2.0  1.0
+julia> A = ones(2)*2
+2-element $(Vector{Float64}):
+ 2.0
+ 2.0
+
+julia> B = ones(3)*3
+3-element $(Vector{Float64}):
+ 3.0
+ 3.0
+ 3.0
+
+julia> ParallelUtilities.BroadcastStack(min, 1)(A, B)
+3-element $(Vector{Float64}):
+ 2.0
+ 2.0
+ 3.0
+
+julia> A = ones(2,2)*2
+2×2 $(Matrix{Float64}):
+ 2.0  2.0
+ 2.0  2.0
+
+julia> B = ones(2,3)*3
+2×3 $(Matrix{Float64}):
+ 3.0  3.0  3.0
+ 3.0  3.0  3.0
+
+julia> ParallelUtilities.BroadcastStack(+, 2)(A, B)
+2×3 $(Matrix{Float64}):
+ 5.0  5.0  3.0
+ 5.0  5.0  3.0
 ```
+"""
+struct BroadcastStack{F, D} <: Function
+    f :: F
+    dims :: D
+end
+
+(s::BroadcastStack)(x, y) = broadcaststack(x, y, s.f, s.dims)
+
+function _union(axes_x_dim::AbstractUnitRange, axes_y_dim::AbstractUnitRange)
+    axes_dim_min = min(minimum(axes_x_dim), minimum(axes_y_dim))
+    axes_dim_max = max(maximum(axes_x_dim), maximum(axes_y_dim))
+    axes_dim = axes_dim_min:axes_dim_max
+end
+_union(axes_x_dim::Base.OneTo, axes_y_dim::Base.OneTo) = axes_x_dim ∪ axes_y_dim
+
+_maybeUnitRange(ax::AbstractUnitRange) = UnitRange(ax)
+_maybeUnitRange(ax::Base.OneTo) = ax
+
+function _subsetaxes(f, axes_x, axes_y, dims)
+    ax = collect(_maybeUnitRange.(axes_x))
+    for dim in dims
+        ax[dim] = f(axes_x[dim], axes_y[dim])
+    end
+    ntuple(i -> ax[i], length(axes_x))
+end
+
+function broadcaststack(x::AbstractArray, y::AbstractArray, f, dims)
+    ndims(x) == ndims(y) || throw(DimensionMismatch("arrays must have the same number of dimensions"))
+
+    for dim in 1:ndims(x)
+        if dim ∈ dims
+            if dim > ndims(x)
+                throw(ArgumentError("dim must lie in 1 <= dim <= ndims(x)"))
+            end
+        else
+            axes(x, dim) == axes(y, dim) || throw(DimensionMismatch("non-concatenated axes must be identical"))
+        end
+    end
+
+    axes_cat = _subsetaxes(_union, axes(x), axes(y), dims)
+
+    xy_cat = similar(x, promote_type(eltype(x), eltype(y)), axes_cat)
+    eltype(xy_cat) <: Number && fill!(xy_cat, zero(eltype(xy_cat)))
+
+    common_ax = CartesianIndices(_subsetaxes(intersect, axes(x), axes(y), dims))
+
+    for arr in (x, y)
+        @inbounds for I in CartesianIndices(arr)
+            I in  common_ax && continue
+            xy_cat[I] = arr[I]
+        end
+    end
+
+    @inbounds for I in common_ax
+        xy_cat[I] = f(x[I], y[I])
+    end
+
+    xy_cat
+end
 
-See also: [`sumcat_aligned`](@ref), [`sumvcat_aligned`](@ref)
 """
-function sumhcat_aligned(A::AbstractArray{T,N}...) where {T,N}
+    Flip(f)
 
-	N >= 2 || throw(ArgumentError("all the arrays need to have at least 2 dimensions"))
-	checkdims(A, 2)
+Flip the arguments of a binary function `f`, so that `Flip(f)(x, y) == f(y,x)`.
 
-	axmin = minimum(minimum.(axes.(A, 2)))
-	axmax = maximum(maximum.(axes.(A, 2)))
-	
-	axcat = axmin:axmax
+# Examples
+```jldoctest flip
+julia> flip1 = ParallelUtilities.Flip(vcat);
+
+julia> flip1(2, 3)
+2-element $(Vector{Int}):
+ 3
+ 2
+```
 
-	trailing_axes = Base.tail(Base.tail(axes(first(A))))
-	
-	arr = OffsetArray{T,N}(undef, axes(first(A),1), axcat, trailing_axes...)
-	fill!(arr, zero(T))
+Two flips pop the original function back:
 
-	for axt in CartesianIndices(trailing_axes)
-		for a in A, ind2 in axes(a,2), ind1 in axes(a,1)
-			arr[ind1, ind2, axt] += a[ind1, ind2, axt]
-		end
-	end
+```jldoctest flip
+julia> flip2 = ParallelUtilities.Flip(flip1);
 
-	arr
+julia> flip2(2, 3)
+2-element $(Vector{Int}):
+ 2
+ 3
+```
+"""
+struct Flip{F} <: Function
+    f :: F
 end
 
-function sumhcat_aligned(A::AbstractArray)
-	ndims(A) >= 2 || throw(ArgumentError("the array needs to have at least 2 dimensions"))
-	A
-end
\ No newline at end of file
+(o::Flip)(x, y) = o.f(y, x)
+
+Flip(o::Flip) = o.f
+
+# Preserve the commutative tag
+Flip(c::Commutative) = Commutative(Flip(c.f))
+Flip(b::BroadcastFunction{1}) = BroadcastFunction{2}(Flip(b.f))
+Flip(b::BroadcastFunction{2}) = BroadcastFunction{1}(Flip(b.f))
diff --git a/src/trees.jl b/src/trees.jl
index 81c7bcd..a73d561 100644
--- a/src/trees.jl
+++ b/src/trees.jl
@@ -1,312 +1,202 @@
-const RemoteChannelContainer{T} = NamedTuple{(:out, :err),Tuple{RemoteChannel{Channel{T}},RemoteChannel{Channel{Bool}}}}
+abstract type BinaryTree end
 
-@inline Base.eltype(::RemoteChannelContainer{T}) where {T} = T
-
-function RemoteChannelContainer{T}(n::Int,p::Int) where {T}
-	out = RemoteChannel(()->Channel{T}(n),p)
-	err = RemoteChannel(()->Channel{Bool}(n),p)
-    RemoteChannelContainer{T}((out,err))
-end
-RemoteChannelContainer{T}(n::Int) where {T} = RemoteChannelContainer{T}(n,myid())
-RemoteChannelContainer(n::Int,p::Int) = RemoteChannelContainer{Any}(n,p)
-RemoteChannelContainer(n::Int) = RemoteChannelContainer{Any}(n,myid())
-
-abstract type Tree end
-abstract type BinaryTree <: Tree end
-
-struct SequentialBinaryTree{T<:AbstractVector{<:Integer}} <: BinaryTree
-	#= Tree of the form 
-					1
-			2				3
-		4		5		6		7
-	8		9
-	=#
-	N :: Int # total number of nodes
-	twochildendind :: Int
-	onechildendind :: Int
-	procs :: T
-
-	function SequentialBinaryTree(procs::T) where {T<:AbstractVector{Int}}
-
-		N = length(procs)
-		(N >=1) || throw(DomainError(N,
-			"need at least one node to create a binary tree"))
-
-		Ninternalnodes = prevpow(2,N) - 1
-		Nleaf = N - Ninternalnodes
-		Nonechildinternalnodes = (Ninternalnodes > 0) ? rem(Nleaf,2) : 0
-		twochildendind = div(N-1, 2)
-		onechildstartind = twochildendind + 1
-		onechildendind = onechildstartind + Nonechildinternalnodes - 1
-
-		new{T}(N,twochildendind,onechildendind,procs)
-	end
-end
-
-struct OrderedBinaryTree{T<:AbstractVector{<:Integer}} <: BinaryTree
+struct OrderedBinaryTree{PROCS <: AbstractVector{<:Integer}, PARENT <: Union{Integer, Nothing}} <: BinaryTree
 	#= Tree of the form
 
 							8
 				4						9
 		2				6
 	1		3		5		7
-					
+
 	The left branch has smaller numbers than the node, and the right
 	branch has larger numbers
+
+    A parent of nothing implies that the top node is its own parent
 	=#
 
 	N :: Int
-	procs :: T
+	procs :: PROCS
+	topnode_parent :: PARENT
 
-	function OrderedBinaryTree(procs::T) where {T<:AbstractVector{<:Integer}}
+	function OrderedBinaryTree(procs::AbstractVector{<:Integer}, p = nothing)
 		N = length(procs)
-		N >= 1 || 
-		throw(DomainError(N,
-			"need at least one node to create a BinaryTree"))
+		N >= 1 || throw(DomainError(N, "need at least one node to create a BinaryTree"))
 
-		new{T}(N,procs)
+		new{typeof(procs), typeof(p)}(N, procs, p)
 	end
 end
+Base.length(tree::OrderedBinaryTree) = length(tree.procs)
 
-abstract type SegmentedBinaryTree <: BinaryTree end
-
-struct SegmentedSequentialBinaryTree{T<:AbstractVector{<:Integer},
-	D<:AbstractDict} <: SegmentedBinaryTree
-	#=
-		Each node on the cluster will have its own tree that carries out
-		a local reduction. There will  be one master node on the cluster that
-		will acquire the reduced value on each node. This will be followed 
-		by a tree to carry out reduction among the master nodes. The 
-		eventual reduced result will be returned to the calling process.
-	=#
-	N :: Int
-	procs :: T
+# Special type for the top tree that correctly returns nchildren for the leaves
+struct ConnectedOrderedBinaryTree{OBT <: OrderedBinaryTree, D <: AbstractDict} <: BinaryTree
+	tree :: OBT
 	workersonhosts :: D
-	toptree :: SequentialBinaryTree{Vector{Int}}
-	nodetreestartindices :: Vector{Int}
+
+	function ConnectedOrderedBinaryTree(tree::OBT, workersonhosts::D) where {OBT <: OrderedBinaryTree, D <: AbstractDict}
+		new{OBT, D}(tree, workersonhosts)
+	end
 end
+Base.length(tree::ConnectedOrderedBinaryTree) = length(tree.tree)
+workersonhosts(tree::ConnectedOrderedBinaryTree) = tree.workersonhosts
 
-struct SegmentedOrderedBinaryTree{T<:AbstractVector{<:Integer},
-	D<:AbstractDict} <: SegmentedBinaryTree
+struct SegmentedOrderedBinaryTree{PROCS <: AbstractVector{<:Integer}, TREE <: ConnectedOrderedBinaryTree} <: BinaryTree
 	#=
 		Each node on the cluster will have its own tree that carries out
 		a local reduction. There will  be one master node on the cluster that
-		will acquire the reduced value on each node. This will be followed 
-		by a tree to carry out reduction among the master nodes. The 
+		will acquire the reduced value on each node. This will be followed
+		by a tree to carry out reduction among the master nodes. The
 		eventual reduced result will be returned to the calling process.
 	=#
 	N :: Int
-	procs :: T
-	workersonhosts :: D
-	toptree :: OrderedBinaryTree{Vector{Int}}
+	procs :: PROCS
+	toptree :: TREE
 	nodetreestartindices :: Vector{Int}
-end
-
-function leavesateachlevelfulltree(::Type{<:SequentialBinaryTree},Nleaves)
-	Nnodes = 2Nleaves-1
-	Nlevels = levels(Nnodes)
-	Nleaves_lowestlevel = Nnodes - ((1 << (Nlevels - 1)) - 1)
 
-	return Nleaves_lowestlevel, Nnodes, Nlevels
-end
-
-function leafrankfoldedtree(::SequentialBinaryTree,Nleaves,leafno)
+	function SegmentedOrderedBinaryTree(N::Int, procs::PROCS,
+		toptree::TREE, nodetreestartindices::Vector{Int}) where {PROCS, TREE <: ConnectedOrderedBinaryTree}
 
-	@assert(leafno <= Nleaves,"leafno needs to be ⩽ Nleaves")
-	
-	Nleaves_lowestlevel, Nnodes, Nlevels = 
-		leavesateachlevelfulltree(SequentialBinaryTree,Nleaves)
+		# check that the reduction nodes of the top tree have children
+		all(i -> nchildren(toptree[i]) == 2, 2:2:length(toptree)) || throw(ArgumentError("reduction nodes on the top tree must have 2 children each"))
 
-	if leafno <= Nleaves_lowestlevel
-		leafrank = (1 << (Nlevels - 1)) - 1 + leafno
-	else
-		leafrank = Nnodes - Nleaves + leafno - Nleaves_lowestlevel
+		new{PROCS, TREE}(N, procs, toptree, nodetreestartindices)
 	end
-
-	return leafrank
-end
-
-function leafrankfoldedtree(::OrderedBinaryTree,Nleaves,leafno)
-	@assert(leafno <= Nleaves,"leafno needs to be ⩽ Nleaves")
-	leafrank = 2leafno - 1
 end
 
-function foldedbinarytreefromleaves(::Type{SequentialBinaryTree},leaves)
-	Nleaves = length(leaves)
-	Nleaves_lowestlevel,Nnodes = 
-	leavesateachlevelfulltree(SequentialBinaryTree,Nleaves)
-
-	treeprocs = Vector{Int}(undef,Nnodes)
-
-	# fill in the leaves
-	@views treeprocs[end - Nleaves_lowestlevel + 1:end] .= 
-		leaves[1:Nleaves_lowestlevel]
-	@views treeprocs[end - Nleaves + 1:end - Nleaves_lowestlevel] .= 
-		leaves[Nleaves_lowestlevel+1:end]
-
-	# fill in the parent nodes
-	for rank in Nnodes-1:-2:2
-		p = treeprocs[rank]
-		parentrank = div(rank,2)
-		treeprocs[parentrank] = p
-	end
+workersonhosts(tree::SegmentedOrderedBinaryTree) = workersonhosts(tree.toptree)
 
-	SequentialBinaryTree(treeprocs)
+function leafrankfoldedtree(::OrderedBinaryTree, Nleaves, leafno)
+	@assert(leafno <= Nleaves, "leafno needs to be ⩽ Nleaves")
+	leafrank = 2leafno - 1
 end
+leafrankfoldedtree(tree::ConnectedOrderedBinaryTree, args...) = leafrankfoldedtree(tree.tree, args...)
 
-function foldedbinarytreefromleaves(::Type{OrderedBinaryTree},leaves)
+function foldedbinarytreefromleaves(leaves)
 	Nleaves = length(leaves)
-	Nnodes = 2Nleaves-1
+	Nnodes = 2Nleaves - 1
 
-	allnodes = Vector{Int}(undef,Nnodes)
-	foldedbinarytreefromleaves!(OrderedBinaryTree,allnodes,leaves)
+	allnodes = Vector{Int}(undef, Nnodes)
+	foldedbinarytreefromleaves!(allnodes, leaves)
 
 	OrderedBinaryTree(allnodes)
 end
 
-function foldedbinarytreefromleaves!(::Type{OrderedBinaryTree},allnodes,leaves)
+function foldedbinarytreefromleaves!(allnodes, leaves)
 	top = topnoderank(OrderedBinaryTree(1:length(allnodes)))
 	allnodes[top] = first(leaves)
 
 	length(allnodes) == 1 && return
 
 	Nnodes_left = top - 1
-	Nleaves_left = div( Nnodes_left + 1 , 2)
+	Nleaves_left = div(Nnodes_left + 1 , 2)
 	Nleaves_right = length(leaves) - Nleaves_left
 
 	if Nleaves_left > 0
 		leaves_left = @view leaves[1:Nleaves_left]
 		leftnodes = @view allnodes[1:Nnodes_left]
-		foldedbinarytreefromleaves!(OrderedBinaryTree,leftnodes,leaves_left)
+		foldedbinarytreefromleaves!(leftnodes, leaves_left)
 	end
 
 	if Nleaves_right > 0
 		leaves_right = @view leaves[end - Nleaves_right + 1:end]
 		rightnodes = @view allnodes[top + 1:end]
-		foldedbinarytreefromleaves!(OrderedBinaryTree,rightnodes,leaves_right)
-	end
-end
-
-# Tree with a distribution of hosts specified by workersonhosts
-# workersonhosts is a Dict that maps (host=>workers)
-function SegmentedSequentialBinaryTree(procs::AbstractVector{<:Integer},
-	workersonhosts::AbstractDict{String,<:AbstractVector{<:Integer}})
-	
-	Np = length(procs)
-	Np >= 1 || throw(DomainError(Np,
-		"need at least one node to create a BinaryTree"))
-
-	nodes = collect(keys(workersonhosts))
-	masternodes = Vector{Int}(undef,length(nodes))
-	for (nodeind,node) in enumerate(nodes)
-		workersnode = workersonhosts[node]
-		nodetree = SequentialBinaryTree(workersnode)
-		masternodes[nodeind] = topnode(nodetree).p
-	end
-	Nleaves = length(masternodes)
-	toptree = foldedbinarytreefromleaves(SequentialBinaryTree,masternodes)
-
-	toptreenonleafnodes = length(toptree) - Nleaves
-	Nnodestotal = toptreenonleafnodes + length(procs)
-
-	nodetreestartindices = Vector{Int}(undef,length(nodes))
-	nodetreestartindices[1] = toptreenonleafnodes + 1
-	for (nodeno,node) in enumerate(nodes)
-		nodeno == 1 && continue
-		prevnode = nodes[nodeno-1]
-		nodetreestartindices[nodeno] = nodetreestartindices[nodeno-1] + 
-										length(workersonhosts[prevnode])
+		foldedbinarytreefromleaves!(rightnodes, leaves_right)
 	end
 
-	SegmentedSequentialBinaryTree(Nnodestotal,procs,workersonhosts,
-		toptree,nodetreestartindices)
-end
-
-function SegmentedSequentialBinaryTree(procs::AbstractVector{<:Integer})
-	workersonhosts = procs_node(procs)
-	SegmentedSequentialBinaryTree(procs,workersonhosts)
+    return allnodes
 end
 
-function SegmentedOrderedBinaryTree(procs::AbstractVector{<:Integer},
-	workersonhosts::AbstractDict{String,<:AbstractVector{<:Integer}})
-	
+function SegmentedOrderedBinaryTree(procs::AbstractVector{<:Integer}, workersonhosts::AbstractDict = procs_node(procs))
 	Np = length(procs)
-	Np >= 1 || throw(DomainError(Np,
-		"need at least one node to create a BinaryTree"))
+	Np >= 1 || throw(DomainError(Np, "need at least one node to create a BinaryTree"))
+
+    sum(length, values(workersonhosts)) == length(procs) || throw(ArgumentError("procs $procs do not match workersonhosts $workersonhosts"))
 
 	nodes = collect(keys(workersonhosts))
-	masternodes = Vector{Int}(undef,length(nodes))
-	for (nodeind,node) in enumerate(nodes)
+	masternodes = Vector{Int}(undef, length(nodes))
+	for (nodeind, node) in enumerate(nodes)
 		workersnode = workersonhosts[node]
 		nodetree = OrderedBinaryTree(workersnode)
 		masternodes[nodeind] = topnode(nodetree).p
 	end
 	Nleaves = length(masternodes)
-	toptree = foldedbinarytreefromleaves(OrderedBinaryTree,masternodes)
+	toptree_inner = foldedbinarytreefromleaves(masternodes)
+	toptree = ConnectedOrderedBinaryTree(toptree_inner, workersonhosts)
 
 	toptreenonleafnodes = length(toptree) - Nleaves
 	Nnodestotal = toptreenonleafnodes + length(procs)
 
-	nodetreestartindices = Vector{Int}(undef,length(nodes))
+	nodetreestartindices = Vector{Int}(undef, length(nodes))
 	nodetreestartindices[1] = toptreenonleafnodes + 1
-	for (nodeno,node) in enumerate(nodes)
+	for (nodeno, node) in enumerate(nodes)
 		nodeno == 1 && continue
-		prevnode = nodes[nodeno-1]
-		nodetreestartindices[nodeno] = nodetreestartindices[nodeno-1] + 
-										length(workersonhosts[prevnode])
+		prevnode = nodes[nodeno - 1]
+		nodetreestartindices[nodeno] = nodetreestartindices[nodeno - 1] + length(workersonhosts[prevnode])
 	end
 
-	SegmentedOrderedBinaryTree(Nnodestotal,procs,workersonhosts,
-		toptree,nodetreestartindices)
-end
-
-function SegmentedOrderedBinaryTree(procs::AbstractVector{<:Integer})
-	workersonhosts = procs_node(procs)
-	SegmentedOrderedBinaryTree(procs,workersonhosts)
+	SegmentedOrderedBinaryTree(Nnodestotal, procs, toptree, nodetreestartindices)
 end
 
 # for a single host there are no segments
-function unsegmentedtree(::Type{<:SegmentedSequentialBinaryTree})
-	SequentialBinaryTree
-end
-function unsegmentedtree(::Type{<:SegmentedOrderedBinaryTree})
-	OrderedBinaryTree
-end
-function unsegmentedtree(tree::SegmentedBinaryTree)
-	T = unsegmentedtree(typeof(tree))
-	T(tree.procs)
+function unsegmentedtree(tree::SegmentedOrderedBinaryTree)
+	OrderedBinaryTree(workers(tree))
+end
+
+Base.length(tree::SegmentedOrderedBinaryTree) = tree.N
+levels(tree::OrderedBinaryTree) = levels(length(tree))
+levels(n::Integer) = floor(Int, log2(n)) + 1
+
+function Base.summary(io::IO, tree::SegmentedOrderedBinaryTree)
+	Nmasternodes = length(keys(workersonhosts(tree)))
+	toptreenonleafnodes = length(toptree(tree)) - Nmasternodes
+	mapnodes = length(tree) - toptreenonleafnodes
+	print(io, length(tree), "-node ", Base.nameof(typeof(tree)))
+	print(io, " with ", mapnodes, " workers and ", toptreenonleafnodes, " extra reduction node",
+        ifelse(toptreenonleafnodes > 1, "s", ""))
+end
+Base.summary(io::IO, tree::BinaryTree) = print(io, length(tree),"-element ", nameof(typeof(tree)))
+
+function Base.show(io::IO, b::OrderedBinaryTree)
+	print(io, summary(b), "(", workers(b), ") with top node = ", topnode(b))
+end
+function Base.show(io::IO, b::ConnectedOrderedBinaryTree)
+	print(io, summary(b), "(", workers(b), ", ", workersonhosts(b), ")")
+end
+
+function Base.show(io::IO, b::SegmentedOrderedBinaryTree)
+	summary(io, b)
+	println(io)
+	println(io, "toptree => ", toptree(b))
+	println(io, "subtrees start from indices ", b.nodetreestartindices)
+	tt = toptree(b)
+	for (ind, (host, w)) in enumerate(workersonhosts(b))
+		node = tt[2ind - 1]
+		print(io, host, " => ",  OrderedBinaryTree(w, node.parent))
+		if ind != length(workersonhosts(b))
+			println(io)
+		end
+	end
 end
 
-@inline Base.length(tree::BinaryTree) = tree.N
-function levels(tree::Union{SequentialBinaryTree,OrderedBinaryTree})
-	levels(length(tree))
-end
-levels(n::Integer) = floor(Int,log2(n)) + 1
+toptree(tree::SegmentedOrderedBinaryTree) = tree.toptree
 
-Base.summary(io::IO,b::Tree) = print(io,length(b),"-node ",typeof(b))
+function levelfromtop(tree::OrderedBinaryTree, i::Integer)
+	1 <= i <= length(tree) || throw(BoundsError(tree, i))
 
-function levelfromtop(tree::OrderedBinaryTree,i::Integer)
-	1 <= i <= length(tree) || throw(BoundsError(tree,i))
-	
 	top = topnoderank(tree)
 	if i == top
 		return 1
 	elseif i < top
 		subrange = 1:top - 1
 	else
-		subrange = top+1:length(tree)
+		subrange = top + 1:length(tree)
 	end
 	subtree = OrderedBinaryTree(subrange)
-	subindex = searchsortedfirst(subrange,i)
-	1 + levelfromtop(subtree,subindex)
-end
-function levelfromtop(tree::SequentialBinaryTree,i::Integer)
-	1 <= i <= length(tree) || throw(BoundsError(tree,i))
-	floor(Int,log2(i)) + 1
+	subindex = searchsortedfirst(subrange, i)
+	1 + levelfromtop(subtree, subindex)
 end
 
-function parentnoderank(tree::OrderedBinaryTree,i::Integer)
-	1 <= i <= length(tree) || throw(BoundsError(tree,i))
+function parentnoderank(tree::OrderedBinaryTree, i::Integer)
+	1 <= i <= length(tree) || throw(BoundsError(tree, i))
 
 	# The topmost node is its own parent
 	length(tree) == 1 && return 1
@@ -320,65 +210,63 @@ function parentnoderank(tree::OrderedBinaryTree,i::Integer)
 		# ired is necessarily an odd number
 		pow2level = 1 << level # 2^level
 
-		# sgn is +1 if mod(ired,4) = 1, -1 if mod(ired,4) = 3
-		sgn = 2 - mod(ired,4)
+		# sgn is +1 if mod(ired, 4) = 1, -1 if mod(ired, 4) = 3
+		sgn = 2 - mod(ired, 4)
 		return i + sgn * pow2level
 	elseif i > top
 		# right branch, possibly partially formed
 		# Carry out a recursive search
-		subtreeprocs = top+1:length(tree)
+		subtreeprocs = top + 1:length(tree)
 		subtree = OrderedBinaryTree(subtreeprocs)
-		subind = searchsortedfirst(subtreeprocs,i)
+		subind = searchsortedfirst(subtreeprocs, i)
 		if subind == topnoderank(subtree)
 			# This catches the case of there only being a leaf node
-			# in the sub-tree
+			# in the sub-tree
 			return top
 		elseif length(subtreeprocs) == 3
-			# don't subdivide to 1-node trees
-			# this lets us avoid confusing this with the case of 
+			# don't subdivide to 1-node trees
+			# this lets us avoid confusing this with the case of
 			# the entire tree having only 1 node
 			return subtreeprocs[2]
 		end
-		pid = parentnoderank(subtree,subind)
+		pid = parentnoderank(subtree, subind)
 		return subtreeprocs[pid]
 	end
 end
-function parentnoderank(tree::SequentialBinaryTree,i::Integer)
-	1 <= i <= length(tree) || throw(BoundsError(tree,i))
+parentnoderank(tree::ConnectedOrderedBinaryTree, i::Integer) = parentnoderank(tree.tree, i)
 
-	# only one node
-	i == 1 && return 1
-	div(i,2)
-end
-
-function subtree_rank(tree::SegmentedBinaryTree,i::Integer)
-	Nmasternodes = length(keys(tree.workersonhosts))
+function subtree_rank(tree::SegmentedOrderedBinaryTree, i::Integer)
+	Nmasternodes = length(keys(workersonhosts(tree)))
 	toptreenonleafnodes = length(tree.toptree) - Nmasternodes
 
 	# node on a subtree at a host
 	subnodeno = i - toptreenonleafnodes
 
-	@assert(subnodeno > 0,"i needs to be greater than $(toptreenonleafnodes)")
+	@assert(subnodeno > 0, "i needs to be greater than $(toptreenonleafnodes)")
 
 	# find out which node this lies on
 	nptotalprevhosts = 0
-	for (host,procs) in tree.workersonhosts
+	for (host, procs) in workersonhosts(tree)
 		np = length(procs)
 		if subnodeno <= nptotalprevhosts + np
 			rankinsubtree = subnodeno - nptotalprevhosts
-			T = unsegmentedtree(typeof(tree))
-			subtree = T(tree.workersonhosts[host])
-			return subtree,rankinsubtree,nptotalprevhosts
+            w_host = workersonhosts(tree)[host]
+			subtree = OrderedBinaryTree(w_host)
+			return subtree, rankinsubtree, nptotalprevhosts
 		end
 		nptotalprevhosts += np
 	end
 end
 
-function masternodeindex(tree::SegmentedBinaryTree, p)
-	leafno = 0
-	T = unsegmentedtree(typeof(tree))
-	for (ind,w) in enumerate(values(tree.workersonhosts))
-		subtree = T(w)
+"""
+	masternodeindex(tree::SegmentedOrderedBinaryTree, p)
+
+Given the top worker `p` on one node, compute the serial order of the host that it corresponds to.
+"""
+function masternodeindex(tree::SegmentedOrderedBinaryTree, p)
+	leafno = nothing
+	for (ind, w) in enumerate(values(workersonhosts(tree)))
+		subtree = OrderedBinaryTree(w)
 		top = topnoderank(subtree)
 		if w[top] == p
 			leafno = ind
@@ -388,66 +276,15 @@ function masternodeindex(tree::SegmentedBinaryTree, p)
 	return leafno
 end
 
-toptree_to_fulltree_index(::SequentialBinaryTree, i) = i
-toptree_to_fulltree_index(::OrderedBinaryTree, i) = div(i,2)
+toptree_to_fulltree_index(::OrderedBinaryTree, i) = div(i, 2)
+toptree_to_fulltree_index(tree::ConnectedOrderedBinaryTree, i) = toptree_to_fulltree_index(tree.tree, i)
 
-fulltree_to_toptree_index(::SequentialBinaryTree, i) = i
 fulltree_to_toptree_index(::OrderedBinaryTree, i) = 2i
+fulltree_to_toptree_index(tree::ConnectedOrderedBinaryTree, i) = fulltree_to_toptree_index(tree.tree, i)
 
-function parentnoderank(tree::SegmentedBinaryTree,i::Integer)
-	1 <= i <= length(tree) || throw(BoundsError(tree,i))
+function nchildren(tree::OrderedBinaryTree, i::Integer)
+	1 <= i <= length(tree) || throw(BoundsError(tree, i))
 
-	Nmasternodes = length(keys(tree.workersonhosts))
-	toptreenonleafnodes = length(tree.toptree) - Nmasternodes
-
-	if toptreenonleafnodes == 0
-		pr = parentnoderank(unsegmentedtree(tree),i)
-
-	elseif i <= toptreenonleafnodes
-		#= In a SegmentedSequentialBinaryTree the leading indices
-		are the parent nodes of the top tree, so ind = i
-		In a SegmentedOrderedBinaryTree, the leaves are removed 
-		from the top tree, so only even numbers are left.
-		In this case, index i of the full tree refers to index 2i of the 
-		top tree, so ind = 2i
-		=#
-		ind = fulltree_to_toptree_index(tree.toptree,i)
-		p = tree.toptree[ind].p
-		#= Compute the parent of the node with rank ind on the top tree.
-		In a SegmentedSequentialBinaryTree this is what we want.
-		In a SegmentedOrderedBinaryTree, we need to convert this back to 
-		the index of the full tree, that is div(pr,2)
-		=# 
-		pr_top = parentnoderank(tree.toptree,ind)
-		pr = toptree_to_fulltree_index(tree.toptree, pr_top)
-		
-	else
-		subtree,rankinsubtree,nptotalprevhosts = subtree_rank(tree,i)
-
-		if rankinsubtree == topnoderank(subtree)
-			# masternode
-			# parent will be on the top-tree
-			p = subtree[rankinsubtree].p
-			leafno = masternodeindex(tree,p)
-			Nmasternodes = length(keys(tree.workersonhosts))
-			leafrank = leafrankfoldedtree(tree.toptree, Nmasternodes,leafno)
-			pr_top = parentnoderank(tree.toptree, leafrank)
-			# Convert back to the rank on the full tree where the 
-			# leaves of the top tree aren't stored.
-			pr = toptree_to_fulltree_index(tree.toptree, pr_top)
-		else
-			# node on a sub-tree
-			pr = parentnoderank(subtree,rankinsubtree)
-			pr += nptotalprevhosts + toptreenonleafnodes
-		end		
-	end
-
-	return pr
-end
-
-function nchildren(tree::OrderedBinaryTree,i::Integer)
-	1 <= i <= length(tree) || throw(BoundsError(tree,i))
-	
 	if isodd(i)
 		0
 	elseif i == length(tree)
@@ -456,57 +293,69 @@ function nchildren(tree::OrderedBinaryTree,i::Integer)
 		2
 	end
 end
-function nchildren(tree::SequentialBinaryTree,i::Integer)
-	1 <= i <= length(tree) || throw(BoundsError(tree,i))
-
-	if i <= tree.twochildendind
-		2
-	elseif i <= tree.onechildendind
-		1
-	else
-		0
-	end
-end
-function nchildren(tree::SegmentedBinaryTree,i::Integer)
-	1 <= i <= length(tree) || throw(BoundsError(tree,i))
+function nchildren(tree::SegmentedOrderedBinaryTree, i::Integer)
+	1 <= i <= length(tree) || throw(BoundsError(tree, i))
 
-	Nmasternodes = length(keys(tree.workersonhosts))
+	Nmasternodes = length(keys(workersonhosts(tree)))
 	toptreenonleafnodes = length(tree.toptree) - Nmasternodes
 
 	if toptreenonleafnodes == 0
-		n = nchildren(unsegmentedtree(tree),i)
+		n = nchildren(unsegmentedtree(tree), i)
 
 	elseif i <= toptreenonleafnodes
-		# The top-tree is a full binary tree.
+		# The top-tree is a full binary tree.
 		# Since the leaves aren't stored, every parent node
 		# has 2 children
 		n = 2
 	else
-		subtree,rankinsubtree = subtree_rank(tree,i)
-		n = nchildren(subtree,rankinsubtree)
+		subtree, rankinsubtree = subtree_rank(tree, i)
+		n = nchildren(subtree, rankinsubtree)
 	end
 
 	return n
 end
+function nchildren(tree::ConnectedOrderedBinaryTree, i::Integer)
+	1 <= i <= length(tree) || throw(BoundsError(tree, i))
+	if isodd(i)
+		host = ""
+		for (ind, h) in enumerate(keys(workersonhosts(tree)))
+			if ind == i ÷ 2 + 1
+				host = h
+			end
+		end
+		st = OrderedBinaryTree(workersonhosts(tree)[host])
+		nchildren(topnode(st))
+	else
+		2
+	end
+end
 
-topnoderank(::BinaryTree) = 1
+topnoderank(tree::ConnectedOrderedBinaryTree) = topnoderank(tree.tree)
 function topnoderank(tree::OrderedBinaryTree)
 	1 << (levels(tree) - 1)
 end
 function topnoderank(tree::SegmentedOrderedBinaryTree)
-	Nmasternodes = length(keys(tree.workersonhosts))
+	Nmasternodes = length(keys(workersonhosts(tree)))
 	toptreenonleafnodes = length(tree.toptree) - Nmasternodes
 
 	if toptreenonleafnodes > 0
 		tnr_top = topnoderank(tree.toptree)
 		tnr = toptree_to_fulltree_index(tree.toptree, tnr_top)
 	else
-		tnr = topnoderank(OrderedBinaryTree(tree.procs))
+		tnr = topnoderank(OrderedBinaryTree(workers(tree)))
 	end
 	return tnr
 end
 
-topnode(tree::Tree) = tree[topnoderank(tree)]
+topnode(tree::BinaryTree) = tree[topnoderank(tree)]
+function topnode(tree::OrderedBinaryTree)
+	node = tree[topnoderank(tree)]
+    if tree.topnode_parent === nothing
+	    BinaryTreeNode(node.p, node.p, node.nchildren)
+    else
+        BinaryTreeNode(node.p, tree.topnode_parent, node.nchildren)
+    end
+end
 
 # Indexing into a OrderedBinaryTree produces a BinaryTreeNode
 struct BinaryTreeNode
@@ -514,41 +363,47 @@ struct BinaryTreeNode
 	parent :: Int
 	nchildren :: Int
 
-	function BinaryTreeNode(p::Int,p_parent::Int,nchildren::Int)
-		(0 <= nchildren <= 2) || 
+	function BinaryTreeNode(p::Int, p_parent::Int, nchildren::Int)
+		(0 <= nchildren <= 2) ||
 		throw(DomainError(nchildren,
 			"attempt to construct a binary tree with $nchildren children"))
 
-		new(p,p_parent,nchildren)
+		new(p, p_parent, nchildren)
 	end
 end
 
-function Base.show(io::IO,b::BinaryTreeNode)
+function Base.show(io::IO, b::BinaryTreeNode)
 	print(io,
 		"BinaryTreeNode(p = $(b.p),"*
 		" parent = $(b.parent), nchildren = $(b.nchildren))")
 end
 
-@inline nchildren(b::BinaryTreeNode) = b.nchildren
+nchildren(b::BinaryTreeNode) = b.nchildren
+
+Distributed.workers(tree::OrderedBinaryTree) = tree.procs
+Distributed.workers(tree::ConnectedOrderedBinaryTree) = workers(tree.tree)
+Distributed.workers(tree::SegmentedOrderedBinaryTree) = tree.procs
 
-function Base.getindex(tree::Tree,i::Integer)
-	1 <= i <= length(tree) || throw(BoundsError(tree,i))
+Distributed.nworkers(tree::BinaryTree) = length(workers(tree))
+
+function Base.getindex(tree::BinaryTree, i::Integer)
+	1 <= i <= length(tree) || throw(BoundsError(tree, i))
+
+	procs = workers(tree)
 
-	procs = tree.procs
-	
 	p = procs[i]
-	pr = parentnoderank(tree,i)
+	pr = parentnoderank(tree, i)
 	p_parent = procs[pr]
-	n = nchildren(tree,i)
+	n = nchildren(tree, i)
 
-	BinaryTreeNode(p,p_parent,n)
+	BinaryTreeNode(p, p_parent, n)
 end
 
-function Base.getindex(tree::SegmentedBinaryTree,i::Integer)
-	1 <= i <= length(tree) || throw(BoundsError(tree,i))
+function Base.getindex(tree::SegmentedOrderedBinaryTree, i::Integer)
+	1 <= i <= length(tree) || throw(BoundsError(tree, i))
 
-	Nmasternodes = length(keys(tree.workersonhosts))
-	toptreenonleafnodes = length(tree.toptree) - Nmasternodes
+	Nmasternodes = length(keys(workersonhosts(tree)))
+	toptreenonleafnodes = length(toptree(tree)) - Nmasternodes
 
 	if toptreenonleafnodes == 0
 		return unsegmentedtree(tree)[i]
@@ -556,104 +411,85 @@ function Base.getindex(tree::SegmentedBinaryTree,i::Integer)
 	elseif i <= toptreenonleafnodes
 		#= In a SegmentedSequentialBinaryTree the leading indices
 		are the parent nodes of the top tree, so ind = i
-		In a SegmentedOrderedBinaryTree, the leaves are removed 
+		In a SegmentedOrderedBinaryTree, the leaves are removed
 		from the top tree, so only even numbers are left.
-		In this case, index i of the full tree refers to index 2i of the 
+		In this case, index i of the full tree refers to index 2i of the
 		top tree, so ind = 2i
 		=#
-		ind = fulltree_to_toptree_index(tree.toptree,i)
+		ind = fulltree_to_toptree_index(tree.toptree, i)
 		p = tree.toptree[ind].p
-		pr_top = parentnoderank(tree.toptree,ind)
+		pr_top = parentnoderank(tree.toptree, ind)
 		p_parent = tree.toptree[pr_top].p
 		n = 2
-		return BinaryTreeNode(p,p_parent,n)
+		return BinaryTreeNode(p, p_parent, n)
 	else
-		subtree,rankinsubtree = subtree_rank(tree,i)
+		subtree, rankinsubtree = subtree_rank(tree, i)
 
 		p = subtree[rankinsubtree].p
-		n = nchildren(subtree,rankinsubtree)
+		n = nchildren(subtree, rankinsubtree)
 
 		if rankinsubtree == topnoderank(subtree)
 			# masternode
 			# parent will be on the top tree
-			Nmasternodes = length(keys(tree.workersonhosts))
-			leafno = masternodeindex(tree,p)
-			leafrank = leafrankfoldedtree(tree.toptree, Nmasternodes,leafno)
+			Nmasternodes = length(keys(workersonhosts(tree)))
+			leafno = masternodeindex(tree, p)
+			leafrank = leafrankfoldedtree(tree.toptree, Nmasternodes, leafno)
 			pr_top = parentnoderank(tree.toptree, leafrank)
 			p_parent = tree.toptree[pr_top].p
 		else
-			# node on a sub-tree
-			pr = parentnoderank(subtree,rankinsubtree)
+			# node on a sub-tree
+			pr = parentnoderank(subtree, rankinsubtree)
 			p_parent = subtree[pr].p
 		end
-		return BinaryTreeNode(p,p_parent,n)
+		return BinaryTreeNode(p, p_parent, n)
 	end
 end
 
 # Branches between nodes
 
-struct BranchChannel{Tmap,Tred}
+struct BranchChannel
 	p :: Int
-	selfchannels :: RemoteChannelContainer{Tmap}
-	parentchannels :: RemoteChannelContainer{Tred}
-	childrenchannels :: RemoteChannelContainer{Tred}
+	parentchannel :: RemoteChannel{Channel{Any}}
+	childrenchannel :: RemoteChannel{Channel{Any}}
 	nchildren :: Int
 
-	function BranchChannel(p::Int,selfchannels::RemoteChannelContainer{Tmap},
-		parentchannels::RemoteChannelContainer{Tred},
-		childrenchannels::RemoteChannelContainer{Tred},
-		nchildren::Int) where {Tmap,Tred}
+	function BranchChannel(p::Int, parentchannel::RemoteChannel, childrenchannel::RemoteChannel, nchildren::Int)
 
-		(0 <= nchildren <= 2) || 
+		(0 <= nchildren <= 2) ||
 		throw(DomainError(nchildren,
 			"attempt to construct a binary tree with $nchildren children"))
-	
-		new{Tmap,Tred}(p,selfchannels,parentchannels,childrenchannels,nchildren)
+
+		new(p, parentchannel, childrenchannel, nchildren)
 	end
 end
-@inline nchildren(b::BranchChannel) = b.nchildren
+nchildren(b::BranchChannel) = b.nchildren
 
-function BranchChannel(p::Integer,Tmap,
-	parentchannels::RemoteChannelContainer{Tred},
-	nchildren::Int) where {Tred}
+childrenerror(nchildren) = throw(DomainError(nchildren,
+	"attempt to construct a binary tree with $nchildren children"))
 
-	(0 <= nchildren <= 2) || 
-	throw(DomainError(nchildren,
-		"attempt to construct a binary tree with $nchildren children"))
+function BranchChannel(p::Integer, parentchannel::RemoteChannel, nchildren::Integer)
 
-	Texp = Tuple{RemoteChannelContainer{Tmap},
-		RemoteChannelContainer{Tred}}
+	(0 <= nchildren <= 2) || childrenerror(nchildren)
 
-	selfchannels, childrenchannels = @sync begin
-		selftask = @async RemoteChannelContainer{Tmap}(1,p)
-		childtask = @async RemoteChannelContainer{Tred}(nchildren,p)
-		fetch.((selftask,childtask)) :: Texp
-	end
-	BranchChannel(p,selfchannels,parentchannels,childrenchannels,nchildren)
+	childrenchannel = RemoteChannel(() -> Channel(nchildren), p)
+	BranchChannel(p, parentchannel, childrenchannel, nchildren)
 end
 
-function BranchChannel{Tmap,Tred}(p::Integer,nchildren::Integer) where {Tmap,Tred}
-	(0 <= nchildren <= 2) || 
-	throw(DomainError(nchildren,
-		"attempt to construct a binary tree with $nchildren children"))
+function BranchChannel(p::Integer, nchildren::Integer)
 
-	Texp = Tuple{RemoteChannelContainer{Tred},
-		RemoteChannelContainer{Tmap},
-		RemoteChannelContainer{Tred}}
+	(0 <= nchildren <= 2) || childrenerror(nchildren)
 
-	parentchannels, selfchannels, childrenchannels = 
-	@sync begin
-		parenttask = @async RemoteChannelContainer{Tred}(1,p)
-		selftask = @async RemoteChannelContainer{Tmap}(1,p)
-		childtask = @async RemoteChannelContainer{Tred}(nchildren,p)
-		fetch.((parenttask,selftask,childtask)) :: Texp
+	parentchannel, childrenchannel = @sync begin
+		parenttask = @async RemoteChannel(() -> Channel(1), p)
+		childtask = @async RemoteChannel(() -> Channel(nchildren), p)
+		asyncmap(fetch, (parenttask, childtask))
 	end
-	BranchChannel(p,selfchannels,parentchannels,childrenchannels,nchildren)
+	BranchChannel(p, parentchannel, childrenchannel, nchildren)
 end
 
 function Base.show(io::IO, b::BranchChannel)
 	N = nchildren(b)
-	p_parent = b.parentchannels.out.where
+	p_parent = b.parentchannel.where
 	p = b.p
 
 	if N == 2
@@ -664,205 +500,42 @@ function Base.show(io::IO, b::BranchChannel)
 		str = "Leaf  : "*string(p_parent)*" ← "*string(p)
 	end
 
-	print(io,str)
+	print(io, str)
 end
 
-function finalize_except_wherewhence(r::RemoteChannel)
-	if (myid() != r.where) && (myid() != r.whence)
-		finalize(r)
-	end
-end
-function finalize_except_wherewhence(r::RemoteChannelContainer)
-	finalize_except_wherewhence.((r.out, r.err))
-end
-
-function Base.finalize(r::RemoteChannelContainer)
-	finalize.((r.out, r.err))
-end
-
-function Base.finalize(bc::BranchChannel)
-	finalize.((bc.selfchannels, bc.childrenchannels))
-	finalize_except_wherewhence(bc.parentchannels)
-end
-
-function createbranchchannels!(branches,Tmap,Tred,tree::OrderedBinaryTree,
-	superbranch::BranchChannel)
-
+function createbranchchannels!(branches, tree::OrderedBinaryTree, superbranch::BranchChannel)
 	top = topnoderank(tree)
 	topnode = tree[top]
-	N = nchildren(topnode)
-	p = topnode.p
 
-	topbranchchannels = BranchChannel(p,Tmap,superbranch.childrenchannels,N)
+	topbranchchannels = BranchChannel(topnode.p, superbranch.childrenchannel, nchildren(topnode))
 	branches[top] = topbranchchannels
 
-	length(tree) == 1 && return
-	
-	left_inds = 1:top-1
-	right_inds = top+1:length(tree)
+	length(tree) == 1 && return nothing
 
-	@sync begin
-		@async if !isempty(left_inds)
-			left_child = OrderedBinaryTree(@view tree.procs[left_inds])
-			createbranchchannels!(@view(branches[left_inds]),
-				Tmap,Tred,left_child,topbranchchannels)
-		end
-		@async if !isempty(right_inds)
-			right_child = OrderedBinaryTree(@view tree.procs[right_inds])
-			createbranchchannels!(@view(branches[right_inds]),Tmap,Tred,right_child,topbranchchannels)
-		end
-	end
-	nothing 
-end
-function createbranchchannels(Tmap,Tred,tree::OrderedBinaryTree)
-
-	branches = Vector{BranchChannel{Tmap,Tred}}(undef,length(tree))
-
-	# the topmost node has to be created separately as 
-	# its children will be linked to itself
-	top = topnoderank(tree)
-	topnode = tree[top]
-	N = nchildren(topnode)
-	p = topnode.p
-	topmostbranch = BranchChannel{Tmap,Tred}(p,N)
-	branches[top] = topmostbranch 
-
-	length(tree) == 1 && return branches
-	
-	left_inds = 1:top-1
-	right_inds = top+1:length(tree)
+	left_inds = 1:top - 1
+	right_inds = top + 1:length(tree)
 
 	@sync begin
 		@async if !isempty(left_inds)
-			left_child = OrderedBinaryTree(@view tree.procs[left_inds])
-			createbranchchannels!(@view(branches[left_inds]),
-				Tmap,Tred,left_child,topmostbranch)
+			left_child = OrderedBinaryTree(@view workers(tree)[left_inds])
+			createbranchchannels!(@view(branches[left_inds]), left_child, topbranchchannels)
 		end
 		@async if !isempty(right_inds)
-			right_child = OrderedBinaryTree(@view tree.procs[right_inds])
-			createbranchchannels!(@view(branches[right_inds]),
-				Tmap,Tred,right_child,topmostbranch)
-		end
-	end
-
-	return branches
-end
-
-function createbranchchannels!(branches,Tmap,Tred,tree::SequentialBinaryTree, 
-	finalnoderank = length(tree))
-
-	length(branches) < 2 && return
-
-	# make sure that the parent nodes are populated
-	parentfilled = [Base.Event() for i=1:tree.onechildendind]
-
-	@sync for noderank in 2:finalnoderank
-		@async begin
-			node = tree[noderank]
-			p = node.p
-			pnr = parentnoderank(tree,noderank)
-			# The first node is filled, no need to wait for it
-			if pnr > 1
-				# Wait otherwise for the parent to get filled
-				wait(parentfilled[pnr])
-			end
-			parentnodebranches = branches[pnr]
-			parentchannels = parentnodebranches.childrenchannels
-			b = BranchChannel(p,Tmap,parentchannels,nchildren(node))
-			branches[noderank] = b
-			# If this is a parent node then notify that it's filled
-			if noderank <= tree.onechildendind
-				notify(parentfilled[noderank])
-			end
+			right_child = OrderedBinaryTree(@view workers(tree)[right_inds])
+			createbranchchannels!(@view(branches[right_inds]), right_child, topbranchchannels)
 		end
 	end
+	return nothing
 end
-function createbranchchannels(Tmap,Tred,tree::SequentialBinaryTree)
 
-	branches = Vector{BranchChannel{Tmap,Tred}}(undef,length(tree))
-
-	# the topmost node has to be created separately as 
-	# it is its own parent
-	top = topnoderank(tree)
-	topnode = tree[top]
-	N = nchildren(topnode)
-	p = topnode.p
-	topmostbranch = BranchChannel{Tmap,Tred}(p,N)
-	branches[top] = topmostbranch
+function createbranchchannels(tree::SegmentedOrderedBinaryTree)
 
-	createbranchchannels!(branches,Tmap,Tred,tree)
-
-	return branches
-end
-
-function createbranchchannels(Tmap,Tred,tree::SegmentedSequentialBinaryTree)
-
-	nodes = keys(tree.workersonhosts)
-	toptree = tree.toptree
-	Nmasternodes = length(nodes)
-	toptreenonleafnodes = length(toptree) - Nmasternodes
-
-	branches = Vector{BranchChannel{Tmap,Tred}}(undef,length(tree))
-
-	# populate the top tree other than the masternodes
-	# This is only run if there are multiple hosts
-	if toptreenonleafnodes > 0
-		top = topnoderank(toptree)
-		topnode_toptree = toptree[top]
-		N = nchildren(topnode_toptree)
-		topmostbranch = BranchChannel{Tmap,Tred}(topnode_toptree.p,N)
-		branches[top] = topmostbranch
-		createbranchchannels!(branches,Tmap,Tred,toptree,
-			toptreenonleafnodes)
-	end
-
-	@sync for (nodeno,node) in enumerate(nodes)
-		@async begin
-			# Top node for each subtree (a masternode)
-			workersnode = tree.workersonhosts[node]
-			nodetree = SequentialBinaryTree(workersnode)
-			topnode_nodetree = topnode(nodetree)
-			p = topnode_nodetree.p
-
-			if toptreenonleafnodes > 0
-				# inherit from the parent node
-				leafno = masternodeindex(tree,p)
-				leafrank = leafrankfoldedtree(tree.toptree, Nmasternodes,leafno)
-				parentrank = parentnoderank(toptree,leafrank)
-				parentnodebranches = branches[parentrank]
-				parentchannels = parentnodebranches.childrenchannels
-			else
-				# This happens if there is only one host, 
-				# in which case there's nothing to inherit.
-				# In this case there's no difference between a 
-				# SegmentedSequentialBinaryTree and a SequentialBinaryTree
-				# The top node is created separately as it is its own parent
-				parentchannels = RemoteChannelContainer{Tred}(1,p)
-			end
-
-			b = BranchChannel(p,Tmap,parentchannels,nchildren(topnode_nodetree))
-			nodetreestartindex = tree.nodetreestartindices[nodeno]
-			branches[nodetreestartindex] = b
-
-			# Populate the rest of the tree
-			subtreeinds = StepRangeLen(nodetreestartindex,1,length(nodetree))
-			branchesnode = @view branches[subtreeinds]
-
-			createbranchchannels!(branchesnode,Tmap,Tred,nodetree)	
-		end
-	end
-
-	return branches
-end
-
-function createbranchchannels(Tmap,Tred,tree::SegmentedOrderedBinaryTree)
-
-	nodes = keys(tree.workersonhosts)
+	nodes = keys(workersonhosts(tree))
 	toptree = tree.toptree
 	Nmasternodes = length(nodes)
 	toptreenonleafnodes = length(toptree) - Nmasternodes
 
-	branches = Vector{BranchChannel{Tmap,Tred}}(undef,length(tree))
+	branches = Vector{BranchChannel}(undef, length(tree))
 
 	# populate the top tree other than the masternodes
 	# This is only run if there are multiple hosts
@@ -870,23 +543,21 @@ function createbranchchannels(Tmap,Tred,tree::SegmentedOrderedBinaryTree)
 		topnoderank_toptree = topnoderank(toptree)
 		topnode_toptree = toptree[topnoderank_toptree]
 		N = nchildren(topnode_toptree)
-		topmostbranch = BranchChannel{Tmap,Tred}(topnode_toptree.p,N)
+		topmostbranch = BranchChannel(topnode_toptree.p, N)
 		branches[topnoderank_toptree] = topmostbranch
-		
-		left_inds = 1:topnoderank_toptree-1
-		right_inds = topnoderank_toptree+1:length(toptree)
+
+		left_inds = 1:(topnoderank_toptree - 1)
+		right_inds = (topnoderank_toptree + 1):length(toptree)
 
 		@sync begin
 			@async if !isempty(left_inds)
-				left_child = OrderedBinaryTree(@view toptree.procs[left_inds])
-				createbranchchannels!(@view(branches[left_inds]),
-					Tmap,Tred,left_child,topmostbranch)
+				left_child = OrderedBinaryTree(@view workers(toptree)[left_inds])
+				createbranchchannels!(@view(branches[left_inds]), left_child, topmostbranch)
 			end
 
 			@async if !isempty(right_inds)
-				right_child = OrderedBinaryTree(@view toptree.procs[right_inds])
-				createbranchchannels!(@view(branches[right_inds]),
-					Tmap,Tred,right_child,topmostbranch)
+				right_child = OrderedBinaryTree(@view workers(toptree)[right_inds])
+				createbranchchannels!(@view(branches[right_inds]), right_child, topmostbranch)
 			end
 		end
 
@@ -899,10 +570,10 @@ function createbranchchannels(Tmap,Tred,tree::SegmentedOrderedBinaryTree)
 		end
 	end
 
-	@sync for (nodeno,node) in enumerate(nodes)
+	@sync for (nodeno, node) in enumerate(nodes)
 		@async begin
 			# Top node for each subtree (a masternode)
-			workersnode = tree.workersonhosts[node]
+			workersnode = workersonhosts(tree)[node]
 			nodetree = OrderedBinaryTree(workersnode)
 			top = topnoderank(nodetree)
 			topnode = nodetree[top]
@@ -910,48 +581,40 @@ function createbranchchannels(Tmap,Tred,tree::SegmentedOrderedBinaryTree)
 
 			if toptreenonleafnodes > 0
 				# inherit from the parent node
-				leafno = masternodeindex(tree,p)
-				leafrank = leafrankfoldedtree(tree.toptree, Nmasternodes,leafno)
+				leafno = masternodeindex(tree, p)
+				leafrank = leafrankfoldedtree(tree.toptree, Nmasternodes, leafno)
 				parentrank = parentnoderank(toptree, leafrank)
 				parentrankfulltree = toptree_to_fulltree_index(toptree, parentrank)
 				parentnodebranches = branches[parentrankfulltree]
-				parentchannels = parentnodebranches.childrenchannels
+				parentchannel = parentnodebranches.childrenchannel
 			else
-				#= This happens if there is only one host, 
+				#= This happens if there is only one host,
 				in which case there's nothing to inherit.
-				In this case there's no difference between a 
+				In this case there's no difference between a
 				SegmentedOrderedBinaryTree and an OrderedBinaryTree
 				The top node is created separately as it is its own parent
 				=#
-				parentchannels = RemoteChannelContainer{Tred}(1,p)
+				parentchannel = RemoteChannel(() -> Channel(1), p)
 			end
 
-			topbranchnode = BranchChannel(p,Tmap,parentchannels,nchildren(topnode))
+			topbranchnode = BranchChannel(p, parentchannel, nchildren(topnode))
 			nodetreestartindex = tree.nodetreestartindices[nodeno]
 			branches[nodetreestartindex + top - 1] = topbranchnode
 
 			# Populate the rest of the tree
-			left_inds_nodetree = (1:top-1)
+			left_inds_nodetree = (1:top - 1)
 			left_inds_fulltree = (nodetreestartindex - 1) .+ left_inds_nodetree
-			right_inds_nodetree = top+1:length(nodetree)
+			right_inds_nodetree = top + 1:length(nodetree)
 			right_inds_fulltree = (nodetreestartindex - 1) .+ right_inds_nodetree
 
 			@async if !isempty(left_inds_nodetree)
-				
-				left_child = OrderedBinaryTree(
-					@view nodetree.procs[left_inds_nodetree])
-				
-				createbranchchannels!(@view(branches[left_inds_fulltree]),
-					Tmap,Tred,left_child,topbranchnode)
+				left_child = OrderedBinaryTree(@view workers(nodetree)[left_inds_nodetree])
+				createbranchchannels!(@view(branches[left_inds_fulltree]), left_child, topbranchnode)
 			end
 
 			@async if !isempty(right_inds_nodetree)
-				
-				right_child = OrderedBinaryTree(
-					@view nodetree.procs[right_inds_nodetree])
-
-				createbranchchannels!(@view(branches[right_inds_fulltree]),
-					Tmap,Tred,right_child,topbranchnode)
+				right_child = OrderedBinaryTree(@view workers(nodetree)[right_inds_nodetree])
+				createbranchchannels!(@view(branches[right_inds_fulltree]), right_child, topbranchnode)
 			end
 		end
 	end
@@ -959,16 +622,47 @@ function createbranchchannels(Tmap,Tred,tree::SegmentedOrderedBinaryTree)
 	return branches
 end
 
-function createbranchchannels(Tmap,Tred,iterators::Tuple,T::Type{<:Tree})
-	w = workersactive(iterators)
-	tree = T(w)
-	branches = createbranchchannels(Tmap,Tred,tree)
-	tree,branches
+function createbranchchannels(pool::AbstractWorkerPool, len::Integer)
+	# The tree spans only the workers picked by `workersactive(pool, len)`
+	tree = SegmentedOrderedBinaryTree(workersactive(pool, len))
+	branches = createbranchchannels(tree)
+	return tree, branches
+end
+
+topbranch(tree::BinaryTree, branches::AbstractVector{<:BranchChannel}) = branches[topnoderank(tree)] # branch stored at the tree's top-node rank
+
+function workersactive(pool::AbstractWorkerPool, len::Integer,
+	workers_on_hosts::AbstractDict = procs_node(workers(pool)))
+	# Pick up to `len` workers of `pool`; the selection itself is delegated to `chooseworkers`
+	# No clamp to the pool size is needed: chooseworkers returns the whole pool if it has <= len workers
+	return chooseworkers(workers(pool), len, workers_on_hosts)
+end
-function createbranchchannels(iterators::Tuple,T::Type{<:Tree})
-	createbranchchannels(Any,Any,iterators,T)
+
+function chooseworkers(workerspool, n::Integer, workers_on_hosts::AbstractDict = procs_node(workerspool))
+	# Select `n` workers from `workerspool`, favouring those that share a host with this process
+	n < 1 && throw(ArgumentError("number of workers to choose must be >= 1"))
+	# A pool that holds no more than `n` workers is handed back untouched
+	length(workerspool) > n || return workerspool
+	localhost = Libc.gethostname()
+	# No worker is listed for this host: take the leading `n` of the pool
+	haskey(workers_on_hosts, localhost) || return workerspool[1:n]
+	localworkers = workers_on_hosts[localhost]
+	# The local host alone can satisfy the request
+	length(localworkers) >= n && return localworkers[1:n]
+	# Otherwise keep all local workers and top up from the other hosts, in the dict's iteration order
+	chosen = localworkers
+	remaining = n - length(chosen)
+	for (host, hostworkers) in workers_on_hosts
+		remaining <= 0 && break
+		host == localhost && continue
+		ntake = min(length(hostworkers), remaining)
+		chosen = vcat(chosen, @view hostworkers[1:ntake])
+		remaining -= ntake
+	end
+	return chosen
+end
 
-function topbranch(tree::Tree,branches::Vector{<:BranchChannel})
-	branches[topnoderank(tree)]
-end
\ No newline at end of file
+function maybetrimmedworkerpool(workers, N)
+	# Pool over the workers picked by `chooseworkers`: all of them if there are at most `N`
+	return WorkerPool(chooseworkers(workers, N))
+end
diff --git a/src/utils.jl b/src/utils.jl
deleted file mode 100644
index 5dc886c..0000000
--- a/src/utils.jl
+++ /dev/null
@@ -1,21 +0,0 @@
-"""
-    nworkersactive(iterators::Tuple)
-
-Number of workers required to contain the outer product of the iterators.
-"""
-function nworkersactive(iterators::Tuple)
-    min(nworkers(), prod(length, iterators))
-end
-nworkersactive(ps::AbstractConstrainedProduct) = nworkersactive(getiterators(ps))
-nworkersactive(args::AbstractRange...) = nworkersactive(args)
-
-"""
-    workersactive(iterators::Tuple)
-
-Workers required to split the outer product of the iterators. 
-If `prod(length, iterators) < nworkers()` then the first `prod(length, iterators)`
-workers are chosen.
-"""
-workersactive(iterators::Tuple) = workers()[1:nworkersactive(iterators)]
-workersactive(ps::AbstractConstrainedProduct) = workersactive(getiterators(ps))
-workersactive(args::AbstractRange...) = workersactive(args)
\ No newline at end of file
diff --git a/test/Project.toml b/test/Project.toml
index 7f6e312..817c485 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,5 +1,14 @@
 [deps]
+Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[compat]
+Aqua = "0.5"
+DataStructures = "0.17, 0.18"
+Documenter = "0.26"
+OffsetArrays = "1.6"
diff --git a/test/misctests_singleprocess.jl b/test/misctests_singleprocess.jl
new file mode 100644
index 0000000..7162c03
--- /dev/null
+++ b/test/misctests_singleprocess.jl
@@ -0,0 +1,144 @@
+using DataStructures
+using Test
+using Aqua
+using ParallelUtilities
+using Documenter
+using OffsetArrays
+import ParallelUtilities: pval, value, chooseworkers, BinaryTreeNode, BranchChannel,
+ProductSplit, SegmentedOrderedBinaryTree
+
+@testset "Project quality" begin
+    Aqua.test_all(ParallelUtilities) # run the full set of Aqua package checks
+end
+# Every doctest in the package (recursively) is set up with `using ParallelUtilities`
+DocMeta.setdocmeta!(ParallelUtilities, :DocTestSetup, :(using ParallelUtilities); recursive=true)
+
+@testset "doctest" begin
+    doctest(ParallelUtilities, manual = false) # manual = false: docstrings only, no manual pages
+end
+
+@testset "pval" begin
+    p1 = pval{Float64}(1, false, 2.0)
+    p2 = pval{Int}(1, false, 2)
+    # Converting to the same parameter gives a result === to the input; another parameter converts the value
+    @test pval{Float64}(p1) === p1
+    @test pval{Int}(p1) === p2
+    # `value` unwraps a pval and passes any other argument through unchanged
+    @test value(p1) === 2.0
+    @test value(p2) === 2
+    @test value(2) === 2
+    @test value(nothing) === nothing
+end
+
+@testset "chooseworkers" begin
+    workers = 1:8
+    workers_on_hosts = OrderedDict("host1" => 1:4, "host2" => 5:8) # neither key is expected to be the local hostname
+    @test chooseworkers(workers, 3, workers_on_hosts) == 1:3
+    @test chooseworkers(workers, 5, workers_on_hosts) == 1:5
+    # Local host listed first: its workers are chosen before those of "host2"
+    workers_on_hosts = OrderedDict(Libc.gethostname() => 1:4, "host2" => 5:8)
+    @test chooseworkers(workers, 3, workers_on_hosts) == 1:3
+    @test chooseworkers(workers, 5, workers_on_hosts) == 1:5
+    # Local host listed second: its workers still come first, then "host1" tops up the rest
+    workers_on_hosts = OrderedDict("host1" => 1:4, Libc.gethostname() => 5:8)
+    @test chooseworkers(workers, 3, workers_on_hosts) == 5:7
+    @test chooseworkers(workers, 5, workers_on_hosts) == [5:8; 1]
+end
+
+@testset "Reduction functions" begin
+    # BroadcastStack with OffsetArrays: the result covers the union of the argument axes, with overlaps summed
+    @testset "BroadcastStack" begin
+        arr = ParallelUtilities.BroadcastStack(+, 1)(ones(2:4), ones(3:5))
+        @test arr == OffsetArray([1, 2, 2, 1], 2:5)
+
+        arr = ParallelUtilities.BroadcastStack(+, 1:2)(ones(1:2, 2:4), ones(2:3, 3:5))
+        arr_exp = OffsetArray([1.0  1.0  1.0  0.0
+                                1.0  2.0  2.0  1.0
+                                0.0  1.0  1.0  1.0], 1:3, 2:5)
+        @test arr == arr_exp
+    end
+
+    @testset "BroadcastFunction" begin
+        x = ones(3); y = ones(3);
+        b = ParallelUtilities.BroadcastFunction{1}(+) # {1}: the first argument receives the result
+        @test b(x, y) == ones(3) * 2
+        @test x == ones(3) * 2
+        @test y == ones(3)
+
+        b = ParallelUtilities.BroadcastFunction{2}(+) # {2}: the second argument receives the result
+        x = ones(3); y = ones(3);
+        @test b(x, y) == ones(3) * 2
+        @test x == ones(3)
+        @test y == ones(3) * 2
+    end
+
+    @testset "Flip" begin
+        x = ones(3); y = ones(3);
+        f = ParallelUtilities.Flip(ParallelUtilities.elementwisesum!) # through Flip it is y that gets updated
+        @test f(x,y) == ones(3) * 2
+        @test x == ones(3)
+        @test y == ones(3) * 2
+
+        x = ones(3); y = ones(3);
+        f = ParallelUtilities.Flip(ParallelUtilities.broadcastinplace(+, Val(2))) # through Flip, Val(2) updates x
+        @test f(x,y) == ones(3) * 2
+        @test x == ones(3) * 2
+        @test y == ones(3)
+    end
+end
+
+@testset "show" begin
+
+    @testset "ProductSplit" begin
+        io = IOBuffer()
+        ps = ProductSplit((1:20, 1:30), 4, 1)
+        show(io, ps)
+        showstr = String(take!(io))
+        startstr = string(length(ps))*"-element ProductSplit"
+        @test startswith(showstr, startstr) # only the leading part of the output is compared
+    end
+
+    @testset "error" begin
+        io = IOBuffer()
+
+        showerror(io, ParallelUtilities.TaskNotPresentError((1:4,), (5,)))
+        strexp = "could not find the task $((5,)) in the list $((1:4,))"
+        @test String(take!(io)) == strexp
+    end;
+
+    @testset "BranchChannel" begin
+        io = IOBuffer()
+        # BranchChannel(p, nchildren) puts the parent channel on `p` too, hence "1 ← 1" below
+        b = BranchChannel(1, 0)
+        show(io, b)
+        strexp = "Leaf  : 1 ← 1"
+        @test String(take!(io)) == strexp
+
+        b = BranchChannel(1, 1)
+        show(io, b)
+        strexp = "Branch: 1 ← 1 ← 1 child"
+        @test String(take!(io)) == strexp
+
+        b = BranchChannel(1, 2)
+        show(io, b)
+        strexp = "Branch: 1 ← 1 ⇇ 2 children"
+        @test String(take!(io)) == strexp
+    end;
+
+    @testset "BinaryTreeNode" begin
+        io = IOBuffer()
+        b = BinaryTreeNode(2, 3, 1)
+        show(io, b)
+        strexp = "BinaryTreeNode(p = 2, parent = 3, nchildren = 1)"
+        @test String(take!(io)) == strexp
+    end;
+
+    @testset "BinaryTree" begin
+        # check that show is working: the output is not inspected, the calls only have to succeed
+        io = IOBuffer()
+        tree = SegmentedOrderedBinaryTree(1:8, OrderedDict("host1" => 1:4, "host2" => 5:8))
+        show(io, tree)
+        show(io, ParallelUtilities.toptree(tree))
+        show(io, ParallelUtilities.toptree(tree).tree)
+    end
+end;
diff --git a/test/paralleltests.jl b/test/paralleltests.jl
new file mode 100644
index 0000000..58c3eb9
--- /dev/null
+++ b/test/paralleltests.jl
@@ -0,0 +1,677 @@
+using Distributed
+
+@everywhere begin
+    using DataStructures
+    using Test
+    using ParallelUtilities
+    using ParallelUtilities.ClusterQueryUtils
+    using OffsetArrays
+    import ParallelUtilities: BinaryTreeNode, BranchChannel,
+    OrderedBinaryTree, SegmentedOrderedBinaryTree,
+    parentnoderank, nchildren,
+    createbranchchannels,
+    workersactive,
+    leafrankfoldedtree,
+    TopTreeNode, SubTreeNode,
+    NoSplat, reducedvalue
+
+    function parentnoderank(tree::SegmentedOrderedBinaryTree, i::Integer)
+        # Test-side method: rank in the full tree of the parent of the node with rank `i`
+        1 <= i <= length(tree) || throw(BoundsError(tree, i))
+        Nmasternodes = length(keys(ParallelUtilities.workersonhosts(tree)))
+        toptreenonleafnodes = length(tree.toptree) - Nmasternodes
+
+        if toptreenonleafnodes == 0
+            pr = parentnoderank(ParallelUtilities.unsegmentedtree(tree),i)
+
+        elseif i <= toptreenonleafnodes
+            #= In a SegmentedSequentialBinaryTree the leading indices
+            are the parent nodes of the top tree, so ind = i
+            In a SegmentedOrderedBinaryTree, the leaves are removed
+            from the top tree, so only even numbers are left.
+            In this case, index i of the full tree refers to index 2i of the
+            top tree, so ind = 2i
+            =#
+            ind = ParallelUtilities.fulltree_to_toptree_index(tree.toptree, i)
+            # `ind` is the rank of node `i` on the top tree.
+            #  Compute the parent of the node with rank ind on the top tree.
+            # In a SegmentedSequentialBinaryTree this is what we want.
+            # In a SegmentedOrderedBinaryTree, we need to convert this back to
+            # the index of the full tree, that is div(pr, 2)
+            pr_top = parentnoderank(tree.toptree, ind)
+            pr = ParallelUtilities.toptree_to_fulltree_index(tree.toptree, pr_top)
+        else
+            subtree, rankinsubtree, nptotalprevhosts = ParallelUtilities.subtree_rank(tree, i)
+
+            if rankinsubtree == ParallelUtilities.topnoderank(subtree)
+                # masternode
+                # parent will be on the top - tree
+                p = subtree[rankinsubtree].p
+                leafno = ParallelUtilities.masternodeindex(tree, p)
+                # Nmasternodes was computed at the top of the function and is reused here
+                leafrank = ParallelUtilities.leafrankfoldedtree(tree.toptree, Nmasternodes, leafno)
+                pr_top = parentnoderank(tree.toptree, leafrank)
+                # Convert back to the rank on the full tree where the
+                # leaves of the top tree aren't stored.
+                pr = ParallelUtilities.toptree_to_fulltree_index(tree.toptree, pr_top)
+            else
+                # node on a sub - tree
+                pr = parentnoderank(subtree, rankinsubtree)
+                pr += nptotalprevhosts + toptreenonleafnodes
+            end
+        end
+
+        return pr
+    end
+end
+
+macro testsetwithinfo(str, ex)
+    quote
+        @info "Testing "*$str # log the testset name before the testset runs
+        @testset $str begin $(esc(ex)); end; # esc: the body is evaluated in the caller's scope
+    end
+end
+
+fmap_local(x) = x^2 # squares its argument
+fred_local(x) = x # single argument: returned unchanged
+fred_local(x, y) = x + y # two arguments: their sum
+
+function showworkernumber(ind, nw)
+    # On entry the cursor is expected at the start of the line
+    print("\u1b[K") # erase from the cursor to the end of the line
+    print("Testing on worker $ind of $nw")
+    # Move to a new line after the last worker; otherwise rewind to the start
+    # of the line so that the next call overwrites this message
+    print(ind == nw ? "\n" : "\r")
+end
+
+@testsetwithinfo "utilities" begin
+    @testset "hostnames" begin
+        hosts = hostnames()
+        nodes = unique(hosts)
+        @test nodenames() == nodes # node names must equal the unique hostnames, in the same order
+        @test nodenames(hosts) == nodes
+        np1 = nprocs_node(hosts, nodes)
+        np2 = nprocs_node(hosts)
+        np3 = nprocs_node()
+        @test np1 == np2 == np3 # every call form has to give the same counts
+        for node in nodes
+            npnode = count(isequal(node), hosts) # reference count computed directly from `hosts`
+            @test np1[node] == npnode
+        end
+        p1 = procs_node(workers(), hosts, nodes)
+        for node in nodes
+            pnode = workers()[findall(isequal(node), hosts)] # workers whose hostname entry equals `node`
+            @test p1[node] == pnode
+        end
+        np4 = nprocs_node(p1) # counts derived from the node => workers mapping
+        @test np1 == np4
+    end
+end;
+
+@testset "BinaryTree" begin
+    @testsetwithinfo "BinaryTreeNode" begin
+        @testset "Constructor" begin
+            p = workers()[1]
+            b = BinaryTreeNode(p, p, 0)
+            @test nchildren(b) == 0
+            b = BinaryTreeNode(p, p, 1)
+            @test nchildren(b) == 1
+            b = BinaryTreeNode(p, p, 2)
+            @test nchildren(b) == 2
+
+            @test_throws DomainError BinaryTreeNode(p, p, 3)
+            @test_throws DomainError BinaryTreeNode(p, p,-1)
+        end
+    end
+
+    @testsetwithinfo "BinaryTree" begin
+        @testsetwithinfo "OrderedBinaryTree" begin
+            @testset "pid and parent" begin
+                for imax = 1:100
+                    procs = 1:imax
+                    tree = OrderedBinaryTree(procs)
+                    @test length(tree) == length(procs)
+
+                    topnoderank = ParallelUtilities.topnoderank(tree)
+                    @test tree[topnoderank].parent == topnoderank
+                    for rank in 1:length(tree)
+                        node = tree[rank]
+                        @test node.p == procs[rank]
+                        @test node.parent == procs[parentnoderank(tree, rank)]
+                    end
+                    @test_throws BoundsError(tree, 0) parentnoderank(tree, 0)
+                    @test_throws BoundsError(tree, imax + 1) parentnoderank(tree, imax + 1)
+                end
+            end
+
+            @testset "nchildren" begin
+                tree = OrderedBinaryTree(1:1)
+                @test nchildren(tree, 1) == nchildren(tree[1]) == tree[1].nchildren == 0
+                @test_throws BoundsError(tree, 0) nchildren(tree, 0)
+                @test_throws BoundsError(tree, 2) nchildren(tree, 2)
+                @test ParallelUtilities.topnoderank(tree) == 1
+
+                tree = OrderedBinaryTree(1:2)
+                @test nchildren(tree, 1) == nchildren(tree[1]) == tree[1].nchildren == 0
+                @test nchildren(tree, 2) == nchildren(tree[2]) == tree[2].nchildren == 1
+                @test_throws BoundsError(tree, 0) nchildren(tree, 0)
+                @test_throws BoundsError(tree, 3) nchildren(tree, 3)
+                @test ParallelUtilities.topnoderank(tree) == 2
+
+                tree = OrderedBinaryTree(1:8)
+                @test nchildren(tree, 1) == nchildren(tree[1]) == tree[1].nchildren == 0
+                @test nchildren(tree, 2) == nchildren(tree[2]) == tree[2].nchildren == 2
+                @test nchildren(tree, 3) == nchildren(tree[3]) == tree[3].nchildren == 0
+                @test nchildren(tree, 4) == nchildren(tree[4]) == tree[4].nchildren == 2
+                @test nchildren(tree, 5) == nchildren(tree[5]) == tree[5].nchildren == 0
+                @test nchildren(tree, 6) == nchildren(tree[6]) == tree[6].nchildren == 2
+                @test nchildren(tree, 7) == nchildren(tree[7]) == tree[7].nchildren == 0
+                @test nchildren(tree, 8) == nchildren(tree[8]) == tree[8].nchildren == 1
+                @test_throws BoundsError(tree, 0) nchildren(tree, 0)
+                @test_throws BoundsError(tree, 9) nchildren(tree, 9)
+                @test ParallelUtilities.topnoderank(tree) == 8
+
+                tree = OrderedBinaryTree(1:11)
+                @test nchildren(tree, 1) == nchildren(tree[1]) == tree[1].nchildren == 0
+                @test nchildren(tree, 2) == nchildren(tree[2]) == tree[2].nchildren == 2
+                @test nchildren(tree, 3) == nchildren(tree[3]) == tree[3].nchildren == 0
+                @test nchildren(tree, 4) == nchildren(tree[4]) == tree[4].nchildren == 2
+                @test nchildren(tree, 5) == nchildren(tree[5]) == tree[5].nchildren == 0
+                @test nchildren(tree, 6) == nchildren(tree[6]) == tree[6].nchildren == 2
+                @test nchildren(tree, 7) == nchildren(tree[7]) == tree[7].nchildren == 0
+                @test nchildren(tree, 8) == nchildren(tree[8]) == tree[8].nchildren == 2
+                @test nchildren(tree, 9) == nchildren(tree[9]) == tree[9].nchildren == 0
+                @test nchildren(tree, 10) == nchildren(tree[10]) == tree[10].nchildren == 2
+                @test nchildren(tree, 11) == nchildren(tree[11]) == tree[11].nchildren == 0
+                @test_throws BoundsError(tree, 0) nchildren(tree, 0)
+                @test_throws BoundsError(tree, 12) nchildren(tree, 12)
+                @test ParallelUtilities.topnoderank(tree) == 8
+
+                tree = OrderedBinaryTree(1:13)
+                @test nchildren(tree, 1) == nchildren(tree[1]) == tree[1].nchildren == 0
+                @test nchildren(tree, 2) == nchildren(tree[2]) == tree[2].nchildren == 2
+                @test nchildren(tree, 3) == nchildren(tree[3]) == tree[3].nchildren == 0
+                @test nchildren(tree, 4) == nchildren(tree[4]) == tree[4].nchildren == 2
+                @test nchildren(tree, 5) == nchildren(tree[5]) == tree[5].nchildren == 0
+                @test nchildren(tree, 6) == nchildren(tree[6]) == tree[6].nchildren == 2
+                @test nchildren(tree, 7) == nchildren(tree[7]) == tree[7].nchildren == 0
+                @test nchildren(tree, 8) == nchildren(tree[8]) == tree[8].nchildren == 2
+                @test nchildren(tree, 9) == nchildren(tree[9]) == tree[9].nchildren == 0
+                @test nchildren(tree, 10) == nchildren(tree[10]) == tree[10].nchildren == 2
+                @test nchildren(tree, 11) == nchildren(tree[11]) == tree[11].nchildren == 0
+                @test nchildren(tree, 12) == nchildren(tree[12]) == tree[12].nchildren == 2
+                @test nchildren(tree, 13) == nchildren(tree[13]) == tree[13].nchildren == 0
+                @test_throws BoundsError(tree, 0) nchildren(tree, 0)
+                @test_throws BoundsError(tree, 14) nchildren(tree, 14)
+                @test ParallelUtilities.topnoderank(tree) == 8
+            end
+
+            @testset "level" begin
+                tree = OrderedBinaryTree(1:15)
+                @test ParallelUtilities.levels(tree) == 4
+
+                @test ParallelUtilities.levelfromtop.((tree,), 1:2:15) == ones(Int, 8).*4
+                @test ParallelUtilities.levelfromtop.((tree,), (2, 6, 10, 14)) == (3, 3, 3, 3)
+                @test ParallelUtilities.levelfromtop.((tree,), (4, 12)) == (2, 2)
+                @test ParallelUtilities.levelfromtop(tree, 8) == 1
+                for p in [0, length(tree) + 1]
+                    @test_throws BoundsError(tree, p) ParallelUtilities.levelfromtop(tree, p)
+                end
+
+                tree = OrderedBinaryTree(1:13)
+                @test ParallelUtilities.levels(tree) == 4
+                @test ParallelUtilities.levelfromtop.((tree,), 1:2:11) == ones(Int, 6).*4
+                @test ParallelUtilities.levelfromtop.((tree,), (2, 6, 10, 13)) == (3, 3, 3, 3)
+                @test ParallelUtilities.levelfromtop.((tree,), (4, 12)) == (2, 2)
+                @test ParallelUtilities.levelfromtop(tree, 8) == 1
+                for p in [0, length(tree) + 1]
+                    @test_throws BoundsError(tree, p) ParallelUtilities.levelfromtop(tree, p)
+                end
+            end
+        end
+
+        @testsetwithinfo "SegmentedOrderedBinaryTree" begin
+            @testsetwithinfo "single host" begin
+                @testset "pid and parent" begin
+                    for nranks in 1:100
+                        procs = 1:nranks
+                        tree = SegmentedOrderedBinaryTree(procs, Dict("host" => procs))
+                        treeOBT = OrderedBinaryTree(procs)
+                        @test length(tree) == length(procs) == length(treeOBT)
+                        toprank = ParallelUtilities.topnoderank(tree)
+                        topnode = tree[toprank]
+                        # The top node is its own parent
+                        @test topnode.parent == toprank
+                        @test topnode == ParallelUtilities.topnode(tree)
+                        expectchildren = length(procs) > 1 # only a lone node is a childless parent
+                        for rank in 1:length(tree)
+                            node = tree[rank]
+                            parentnode = tree[parentnoderank(tree, rank)]
+                            @test expectchildren ? nchildren(parentnode) > 0 : nchildren(parentnode) == 0
+                            @test node.p == procs[rank]
+                            @test node.parent == procs[parentnoderank(treeOBT, rank)]
+                            @test parentnode.p == node.parent
+                        end
+                    end
+                end;
+
+                @testset "nchildren" begin # per-rank child counts on one host, read through all three accessors
+                    procs = 1:1 # a single worker: a lone node without children
+                    tree = SegmentedOrderedBinaryTree(procs, Dict("host" => procs))
+                    @test nchildren(tree, 1) == nchildren(tree[1]) == tree[1].nchildren == 0
+                    @test_throws BoundsError(tree, 0) nchildren(tree, 0)
+                    @test_throws BoundsError(tree, 2) nchildren(tree, 2)
+                    @test ParallelUtilities.topnoderank(tree) == 1
+                    # Two workers: rank 2 is the top node and has rank 1 as its only child
+                    procs = 1:2
+                    tree = SegmentedOrderedBinaryTree(procs, Dict("host" => procs))
+                    @test nchildren(tree, 1) == nchildren(tree[1]) == tree[1].nchildren == 0
+                    @test nchildren(tree, 2) == nchildren(tree[2]) == tree[2].nchildren == 1
+                    @test_throws BoundsError(tree, 0) nchildren(tree, 0)
+                    @test_throws BoundsError(tree, 3) nchildren(tree, 3)
+                    @test ParallelUtilities.topnoderank(tree) == 2
+                    # Eight workers: odd ranks are leaves, the top node (rank 8) has a single child
+                    procs = 1:8
+                    tree = SegmentedOrderedBinaryTree(procs, Dict("host" => procs))
+                    @test nchildren(tree, 1) == nchildren(tree[1]) == tree[1].nchildren == 0
+                    @test nchildren(tree, 2) == nchildren(tree[2]) == tree[2].nchildren == 2
+                    @test nchildren(tree, 3) == nchildren(tree[3]) == tree[3].nchildren == 0
+                    @test nchildren(tree, 4) == nchildren(tree[4]) == tree[4].nchildren == 2
+                    @test nchildren(tree, 5) == nchildren(tree[5]) == tree[5].nchildren == 0
+                    @test nchildren(tree, 6) == nchildren(tree[6]) == tree[6].nchildren == 2
+                    @test nchildren(tree, 7) == nchildren(tree[7]) == tree[7].nchildren == 0
+                    @test nchildren(tree, 8) == nchildren(tree[8]) == tree[8].nchildren == 1
+                    @test_throws BoundsError(tree, 0) nchildren(tree, 0)
+                    @test_throws BoundsError(tree, 9) nchildren(tree, 9)
+                    @test ParallelUtilities.topnoderank(tree) == 8
+                    # Eleven workers: odd ranks are leaves, every even rank has two children
+                    procs = 1:11
+                    tree = SegmentedOrderedBinaryTree(procs, Dict("host" => procs))
+                    @test nchildren(tree, 1) == nchildren(tree[1]) == tree[1].nchildren == 0
+                    @test nchildren(tree, 2) == nchildren(tree[2]) == tree[2].nchildren == 2
+                    @test nchildren(tree, 3) == nchildren(tree[3]) == tree[3].nchildren == 0
+                    @test nchildren(tree, 4) == nchildren(tree[4]) == tree[4].nchildren == 2
+                    @test nchildren(tree, 5) == nchildren(tree[5]) == tree[5].nchildren == 0
+                    @test nchildren(tree, 6) == nchildren(tree[6]) == tree[6].nchildren == 2
+                    @test nchildren(tree, 7) == nchildren(tree[7]) == tree[7].nchildren == 0
+                    @test nchildren(tree, 8) == nchildren(tree[8]) == tree[8].nchildren == 2
+                    @test nchildren(tree, 9) == nchildren(tree[9]) == tree[9].nchildren == 0
+                    @test nchildren(tree, 10) == nchildren(tree[10]) == tree[10].nchildren == 2
+                    @test nchildren(tree, 11) == nchildren(tree[11]) == tree[11].nchildren == 0
+                    @test_throws BoundsError(tree, 0) nchildren(tree, 0)
+                    @test_throws BoundsError(tree, 12) nchildren(tree, 12)
+                    @test ParallelUtilities.topnoderank(tree) == 8
+
+                    procs = 1:13 # same alternating pattern; the top node stays at rank 8
+                    tree = SegmentedOrderedBinaryTree(procs, Dict("host" => procs))
+                    @test nchildren(tree, 1) == nchildren(tree[1]) == tree[1].nchildren == 0
+                    @test nchildren(tree, 2) == nchildren(tree[2]) == tree[2].nchildren == 2
+                    @test nchildren(tree, 3) == nchildren(tree[3]) == tree[3].nchildren == 0
+                    @test nchildren(tree, 4) == nchildren(tree[4]) == tree[4].nchildren == 2
+                    @test nchildren(tree, 5) == nchildren(tree[5]) == tree[5].nchildren == 0
+                    @test nchildren(tree, 6) == nchildren(tree[6]) == tree[6].nchildren == 2
+                    @test nchildren(tree, 7) == nchildren(tree[7]) == tree[7].nchildren == 0
+                    @test nchildren(tree, 8) == nchildren(tree[8]) == tree[8].nchildren == 2
+                    @test nchildren(tree, 9) == nchildren(tree[9]) == tree[9].nchildren == 0
+                    @test nchildren(tree, 10) == nchildren(tree[10]) == tree[10].nchildren == 2
+                    @test nchildren(tree, 11) == nchildren(tree[11]) == tree[11].nchildren == 0
+                    @test nchildren(tree, 12) == nchildren(tree[12]) == tree[12].nchildren == 2
+                    @test nchildren(tree, 13) == nchildren(tree[13]) == tree[13].nchildren == 0
+                    @test_throws BoundsError(tree, 0) nchildren(tree, 0)
+                    @test_throws BoundsError(tree, 14) nchildren(tree, 14)
+                    @test ParallelUtilities.topnoderank(tree) == 8
+                end;
+            end;
+
+            @testsetwithinfo "multiple hosts" begin
+                @testset "length" begin
+                    # The expected lengths exceed the worker count: apparently the tree
+                    # also holds the nodes that join the hosts (1 for 2 hosts, 3 for 4).
+                    # Two hosts with one worker each
+                    hosts = OrderedDict("host1" => 1:1, "host2" => 2:2)
+                    tree = SegmentedOrderedBinaryTree(1:2, hosts)
+                    @test length(tree) == 2 + 1
+
+                    # Two hosts with two workers each
+                    hosts = OrderedDict("host1" => 1:2, "host2" => 3:4)
+                    tree = SegmentedOrderedBinaryTree(1:4, hosts)
+                    @test length(tree) == 4 + 1
+
+                    # Four hosts with three workers each
+                    hosts = OrderedDict(
+                        "host1" => 1:3, "host2" => 4:6,
+                        "host3" => 7:9, "host4" => 10:12)
+                    tree = SegmentedOrderedBinaryTree(1:12, hosts)
+                    @test length(tree) == 12 + 3
+                end;
+
+                @testset "leafrankfoldedtree" begin
+                    treeflag = OrderedBinaryTree(1:1)
+                    # With the second argument fixed at 5, the third argument k = 1:5
+                    # is expected to map to the odd ranks 2k - 1 = 1, 3, 5, 7, 9
+                    for (k, expectedrank) in zip(1:5, 1:2:9)
+                        @test leafrankfoldedtree(treeflag, 5, k) == expectedrank
+                    end
+                end;
+
+                @testset "pid and parent" begin
+                    for imax in 2:100
+                        procs = 1:imax
+                        mid = div(imax, 2)
+                        workersonhosts = OrderedDict{String, Vector{Int}}()
+                        workersonhosts["host1"] = procs[1:mid]
+                        workersonhosts["host2"] = procs[mid + 1:end]
+                        tree = SegmentedOrderedBinaryTree(procs, workersonhosts)
+                        toptreepid = ParallelUtilities.topnode(tree.toptree).p
+                        top = ParallelUtilities.topnoderank(tree)
+                        @test tree[top] == ParallelUtilities.topnode(tree)
+                        # Workers of host1 are looked up at tree ranks 2:mid + 1
+                        hostpids = workersonhosts["host1"]
+                        hosttree = OrderedBinaryTree(hostpids)
+                        for (ind, rank) in enumerate(1:mid)
+                            node = tree[rank + 1]
+                            parentnode = tree[parentnoderank(tree, rank + 1)]
+                            @test parentnode.p == node.parent
+                            @test node.p == hostpids[ind]
+                            if ind == ParallelUtilities.topnoderank(hosttree)
+                                # Special check for 2 hosts as there's only one node in the top tree
+                                @test node.parent == toptreepid
+                            else
+                                @test node.parent == hostpids[parentnoderank(hosttree, ind)]
+                            end
+                        end
+                        # Workers of host2 follow at tree ranks mid + 2:imax + 1
+                        hostpids = workersonhosts["host2"]
+                        hosttree = OrderedBinaryTree(hostpids)
+                        for (ind, rank) in enumerate(mid + 1:imax)
+                            node = tree[rank + 1]
+                            parentnode = tree[parentnoderank(tree, rank + 1)]
+                            @test parentnode.p == node.parent
+                            @test node.p == hostpids[ind]
+                            if ind == ParallelUtilities.topnoderank(hosttree)
+                                # Special check for 2 hosts as there's only one node in the top tree
+                                @test node.parent == toptreepid
+                            else
+                                @test node.parent == hostpids[parentnoderank(hosttree, ind)]
+                            end
+                        end
+                    end
+                end;
+
+                @testset "nchildren" begin # per-rank child counts when the workers span several hosts
+                    procs = 1:2 # two hosts with one worker each: rank 1 has both workers as children
+                    tree = SegmentedOrderedBinaryTree(procs,
+                        OrderedDict("host1" => 1:1,"host2" => 2:2))
+                    @test nchildren(tree, 1) == nchildren(tree[1]) == tree[1].nchildren == 2
+                    @test nchildren(tree, 2) == nchildren(tree[2]) == tree[2].nchildren == 0
+                    @test nchildren(tree, 3) == nchildren(tree[3]) == tree[3].nchildren == 0
+                    @test_throws BoundsError(tree, 0) nchildren(tree, 0)
+                    @test_throws BoundsError(tree, 4) nchildren(tree, 4)
+                    # Four hosts with three workers each: 15 ranks in total
+                    procs = 1:12
+                    tree = SegmentedOrderedBinaryTree(procs,
+                        OrderedDict(
+                            "host1" => 1:3,"host2" => 4:6,
+                            "host3" => 7:9,"host4" => 10:12))
+                    @test nchildren(tree, 1) == nchildren(tree[1]) == tree[1].nchildren == 2 # ranks 1:3 presumably form the top tree joining the hosts
+                    @test nchildren(tree, 2) == nchildren(tree[2]) == tree[2].nchildren == 2
+                    @test nchildren(tree, 3) == nchildren(tree[3]) == tree[3].nchildren == 2
+                    @test nchildren(tree, 4) == nchildren(tree[4]) == tree[4].nchildren == 0 # from here on the pattern 0, 2, 0 repeats for every group of three
+                    @test nchildren(tree, 5) == nchildren(tree[5]) == tree[5].nchildren == 2
+                    @test nchildren(tree, 6) == nchildren(tree[6]) == tree[6].nchildren == 0
+                    @test nchildren(tree, 7) == nchildren(tree[7]) == tree[7].nchildren == 0
+                    @test nchildren(tree, 8) == nchildren(tree[8]) == tree[8].nchildren == 2
+                    @test nchildren(tree, 9) == nchildren(tree[9]) == tree[9].nchildren == 0
+                    @test nchildren(tree, 10) == nchildren(tree[10]) == tree[10].nchildren == 0
+                    @test nchildren(tree, 11) == nchildren(tree[11]) == tree[11].nchildren == 2
+                    @test nchildren(tree, 12) == nchildren(tree[12]) == tree[12].nchildren == 0
+                    @test nchildren(tree, 13) == nchildren(tree[13]) == tree[13].nchildren == 0
+                    @test nchildren(tree, 14) == nchildren(tree[14]) == tree[14].nchildren == 2
+                    @test nchildren(tree, 15) == nchildren(tree[15]) == tree[15].nchildren == 0
+                    @test_throws BoundsError(tree, 0) nchildren(tree, 0)
+                    @test_throws BoundsError(tree, 16) nchildren(tree, 16)
+                end;
+            end;
+        end
+    end
+end;
+
+@testsetwithinfo "reduction" begin
+
+    @testset "BranchChannel" begin
+        toparent = RemoteChannel(() -> Channel(1)) # only needed by the three-argument form below
+        @test_throws DomainError BranchChannel(1, 3)
+        @test_throws DomainError BranchChannel(1, toparent, 3) # NOTE(review): the trailing 3 is presumably a child count - confirm
+    end
+
+    @testset "TopTreeNode" begin
+        # Special test for this as this is usually not called when tests are carried out on the same machine
+        parentchannel = RemoteChannel(() -> Channel(1))
+        childrenchannel = RemoteChannel(() -> Channel(2))
+        pipe = ParallelUtilities.BranchChannel(1, parentchannel, childrenchannel, 2)
+        # Case 1: the two values queued on the children channel (1 and 2) are reduced with + to 3
+        put!(childrenchannel, ParallelUtilities.pval(1, false, 1))
+        put!(childrenchannel, ParallelUtilities.pval(2, false, 2))
+
+        redval = reducedvalue(+, ParallelUtilities.TopTreeNode(1), pipe, nothing)
+        @test redval === ParallelUtilities.pval(1, false, 3)
+        # Case 2: one entry is pval(2, true, nothing) (second field presumably an error flag); the result carries true and nothing
+        put!(childrenchannel, ParallelUtilities.pval(1, false, 1))
+        put!(childrenchannel, ParallelUtilities.pval(2, true, nothing))
+
+        redval = reducedvalue(+, ParallelUtilities.TopTreeNode(1), pipe, nothing)
+        @test redval === ParallelUtilities.pval(1, true, nothing)
+        # Case 3: a reduction function that throws must surface as an exception
+        put!(childrenchannel, ParallelUtilities.pval(1, false, 1))
+        put!(childrenchannel, ParallelUtilities.pval(2, false, 2))
+        @test_throws Exception reducedvalue(x -> error(""), ParallelUtilities.TopTreeNode(1), pipe, nothing)
+    end
+
+    @testset "fake multiple hosts" begin
+        tree = ParallelUtilities.SegmentedOrderedBinaryTree([1,1], OrderedDict("host1" => 1:1, "host2" => 1:1))
+        branches = ParallelUtilities.createbranchchannels(tree)
+        @test ParallelUtilities.pmapreduceworkers(x -> 1, +, (tree, branches), (1:4,)) == 4
+        # Choose workers on the same node to avoid communication bottlenecks in testing
+        w = nworkers() > 1 ? first(values(procs_node())) : Int[]
+        # Two fake hosts need two real workers on that node; split by position, not by id range
+        if length(w) > 1
+            hosts = OrderedDict("host1" => w[1:1], "host2" => w[2:end])
+            tree = ParallelUtilities.SegmentedOrderedBinaryTree(w, hosts)
+            branches = ParallelUtilities.createbranchchannels(tree)
+            @test ParallelUtilities.pmapreduceworkers(x -> 1, +, (tree, branches), (1:length(w),)) == length(w)
+        end
+    end
+end
+
+@testset "pmapreduce" begin
+    @testsetwithinfo "pmapreduce" begin
+        @testsetwithinfo "sum" begin
+            @testsetwithinfo "comparison with mapreduce" begin # pmapreduce must reproduce Base.mapreduce
+                for iterators in Any[(1:1,), (ones(2,2),), (1:10,)]
+                    res_exp = mapreduce(x -> x^2, +, iterators...)
+                    res = pmapreduce(x -> x^2, +, iterators...)
+                    @test res_exp == res
+                    # Same comparison with an initial value
+                    res_exp = mapreduce(x -> x^2, +, iterators..., init = 100)
+                    res = pmapreduce(x -> x^2, +, iterators..., init = 100)
+                    @test res_exp == res
+                end
+
+                @testset "dictionary" begin
+                    res = pmapreduce(x -> Dict(x => x), merge, 1:1)
+                    res_exp = mapreduce(x -> Dict(x => x), merge, 1:1)
+                    @test res == res_exp
+
+                    res = pmapreduce(x -> Dict(x => x), merge, 1:200)
+                    res_exp = mapreduce(x -> Dict(x => x), merge, 1:200)
+                    @test res == res_exp
+
+                    res = pmapreduce(x -> OrderedDict(x => x), merge, 1:20)
+                    res_exp = mapreduce(x -> OrderedDict(x => x), merge, 1:20)
+                    @test res == res_exp
+                end
+                # Two iterators of equal length passed as separate arguments
+                iterators = (1:10, 2:2:20)
+                res_exp = mapreduce((x, y) -> x*y, +, iterators...)
+                res = pmapreduce((x, y) -> x*y, +, iterators...)
+                @test res_exp == res
+                # ... and with an initial value
+                res_exp = mapreduce((x, y) -> x*y, +, iterators..., init = 100)
+                res = pmapreduce((x, y) -> x*y, +, iterators..., init = 100)
+                @test res_exp == res
+                # A product iterator: every element is an (x, y) tuple that the mapping function destructures
+                iterators = (1:10, 2:2:20)
+                iterators_product = Iterators.product(iterators...)
+                res_exp = mapreduce(((x, y),) -> x*y, +, iterators_product)
+                res = pmapreduce(((x, y),) -> x*y, +, iterators_product)
+                @test res_exp == res
+                # Two product iterators mapped over together
+                res_exp_2itp = mapreduce(((x, y), (a, b)) -> x*a + y*b, +, iterators_product, iterators_product)
+                res_2itp = pmapreduce(((x, y), (a, b)) -> x*a + y*b, +, iterators_product, iterators_product)
+                @test res_2itp == res_exp_2itp
+                # ParallelUtilities.product must give the same results as Iterators.product
+                iterators_product_putil = ParallelUtilities.product(iterators...)
+                res_exp2 = mapreduce(((x, y),) -> x*y, +, iterators_product_putil)
+                res2 = pmapreduce(((x, y),) -> x*y, +, iterators_product_putil)
+                @test res_exp2 == res2
+                @test res_exp2 == res_exp
+
+                res_exp_2pup = mapreduce(((x, y), (a, b)) -> x*a + y*b, +, iterators_product_putil, iterators_product_putil)
+                res_2pup = pmapreduce(((x, y), (a, b)) -> x*a + y*b, +, iterators_product_putil, iterators_product_putil)
+                @test res_2pup == res_exp_2pup
+                @test res_2pup == res_2itp
+            end
+
+            @testsetwithinfo "pmapreduce_productsplit" begin
+                ownid = x -> myid() # expected total below: every worker contributes its own id once
+                for (f, iters) in Any[(ownid, (1:nworkers(),)), (NoSplat(ownid), (1:nworkers(),)), (ownid, (1:nworkers(), 1:1))]
+                    @test pmapreduce_productsplit(f, +, iters...) == sum(workers())
+                end
+            end
+        end;
+
+        @testsetwithinfo "inplace assignment" begin
+            # Each case pairs an in-place reduction with its out-of-place equivalent and
+            # the length n of the range 1:n that is mapped over. The reference reduces
+            # min(n, nworkers()) copies of ones(2), presumably one per worker with a share.
+            cases = Any[
+                (ParallelUtilities.elementwisesum!, +, 10),
+                (ParallelUtilities.elementwiseproduct!, (x,y) -> x .* y, 4),
+                (ParallelUtilities.elementwisemin!, (x,y) -> min.(x,y), 4),
+                (ParallelUtilities.elementwisemax!, (x,y) -> max.(x,y), 4),
+            ]
+
+            for (reducer!, reference, n) in cases
+                res = pmapreduce_productsplit(x -> ones(2), reducer!, 1:n)
+                resexp = mapreduce(x -> ones(2), reference, 1:min(n, nworkers()))
+                @test res == resexp
+            end
+        end
+
+        @testsetwithinfo "concatenation" begin
+            @testsetwithinfo "comparison with mapreduce" begin
+                # Concatenation depends on the order of the operands, so these checks
+                # also pin the order in which the partial results are combined
+                for catfn in (vcat, hcat)
+                    expected = mapreduce(identity, catfn, 1:nworkers())
+                    @test pmapreduce(identity, catfn, 1:nworkers()) == expected
+                end
+            end
+
+            @testsetwithinfo "pmapreduce_productsplit" begin
+                # The reference concatenates one ones(2) per worker
+                for catfn in (vcat, hcat)
+                    expected = mapreduce(identity, catfn, (ones(2) for i in 1:nworkers()))
+                    @test pmapreduce_productsplit(x -> ones(2), catfn, 1:nworkers()) == expected
+                end
+            end
+        end;
+
+        @testsetwithinfo "run elsewhere" begin # launch the reductions from each worker instead of from this process
+            @testsetwithinfo "sum" begin
+                res_exp = sum(workers())
+                c = Channel(nworkers()) # receives one (task index, result, errored) tuple per task
+                tasks = Vector{Task}(undef, nworkers())
+                @sync begin
+                    for (ind, p) in enumerate(workers())
+                        tasks[ind] = @async begin
+                            try
+                                res = @fetchfrom p pmapreduce_productsplit(x -> myid(), +, 1:nworkers())
+                                put!(c,(ind, res, false))
+                            catch
+                                put!(c,(ind, 0, true)) # report the failure so that the consumer loop below is not left waiting
+                                rethrow()
+                            end
+                        end
+                    end
+                    for i = 1:nworkers()
+                        ind, res, err = take!(c)
+                        err && wait(tasks[ind]) # waiting on a failed task raises its error here
+                        @test res == res_exp
+                        showworkernumber(i, nworkers())
+                    end
+                end
+            end
+            # concatenation where the rank is used in the mapping function
+            # Preserves order of the iterators
+            @testsetwithinfo "concatenation using rank" begin
+                c = Channel(nworkers()) # receives one (task index, comparison result, errored) tuple per task
+                tasks = Vector{Task}(undef, nworkers())
+                @sync begin
+                    for (ind, p) in enumerate(workers())
+                        tasks[ind] = @async begin
+                            try
+                                res = @fetchfrom p (pmapreduce_productsplit(x -> x[1][1], vcat, 1:nworkers()) == mapreduce(identity, vcat, 1:nworkers()))
+                                put!(c,(ind, res, false))
+                            catch
+                                put!(c,(ind, false, true)) # report the failure so that the consumer loop below is not left waiting
+                                rethrow()
+                            end
+                        end
+                    end
+                    for i = 1:nworkers()
+                        ind, res, err = take!(c)
+                        err && wait(tasks[ind]) # waiting on a failed task raises its error here
+                        @test res
+                        showworkernumber(i, nworkers())
+                    end
+                end
+            end
+        end;
+
+        @testsetwithinfo "errors" begin # a failure in the map, the reduction or both must raise an exception
+            @test_throws Exception pmapreduce(x -> error("map"), +, 1:10)
+            @test_throws Exception pmapreduce(identity, x -> error("reduce"), 1:10)
+            @test_throws Exception pmapreduce(x -> error("map"), x -> error("reduce"), 1:10)
+            # fmap and fred are defined outside this testset (presumably named functions that throw - confirm)
+            @test_throws Exception pmapreduce(fmap, +, 1:10)
+            @test_throws Exception pmapreduce(identity, fred, 1:10)
+            @test_throws Exception pmapreduce(fmap, fred, 1:10)
+            # The *_local variants are only exercised when nworkers() differs from nprocs()
+            if nworkers() != nprocs()
+                @test_throws Exception pmapreduce(fmap_local, +, 1:10)
+                @test_throws Exception pmapreduce(identity, fred_local, 1:10)
+                @test_throws Exception pmapreduce(fmap_local, fred, 1:10)
+                @test_throws Exception pmapreduce(fmap_local, fred_local, 1:10)
+            end
+        end;
+    end;
+    @testsetwithinfo "pmapbatch" begin
+        # pmapbatch must agree with pmap, for one as well as for several iterators
+        cases = Any[
+            ((1:1,), x -> 1),
+            ((1:10,), x -> 1),
+            ((1:5,), x -> ones(1) * x),
+            ((1:10, 1:10), (x,y) -> ones(3) * (x+y))]
+        for (iters, f) in cases
+            @test pmapbatch(f, iters...) == pmap(f, iters...)
+        end
+
+        single = pmapbatch_productsplit(x -> sum(sum(i) for i in x) * ones(2), 1:1, 1:1)
+        @test single == [[2.0, 2.0]]
+        # Mapping each split to its worker rank is expected to yield the ranks in order
+        ranks = pmapbatch_productsplit(x -> ParallelUtilities.workerrank(x), 1:nworkers(), 1:nworkers())
+        @test ranks == [1:nworkers();]
+    end
+end;
diff --git a/test/productsplit.jl b/test/productsplit.jl
new file mode 100644
index 0000000..9430c1b
--- /dev/null
+++ b/test/productsplit.jl
@@ -0,0 +1,454 @@
+using Distributed
+using Test
+using ParallelUtilities
+import ParallelUtilities: ProductSplit, ProductSection,
+minimumelement, maximumelement, extremaelement, nelements, dropleading, indexinproduct,
+extremadims, localindex, extrema_commonlastdim, whichproc, procrange_recast, whichproc_localindex,
+getiterators, _niterators
+
+macro testsetwithinfo(str, ex) # like @testset, but first logs "Testing <str>" through @info
+    quote
+        @info "Testing "*$str
+        @testset $str begin $(esc(ex)); end; # esc: the test body is evaluated in the caller's scope
+    end
+end
+
+@testsetwithinfo "AbstractConstrainedProduct" begin
+
+    various_iters = Any[(1:10,), (1:1:10,), (1:10, 4:6), (1:1:10, 4:6), (1:10, 4:6, 1:4), (1:2:9,), (1:2:9, 4:1:6),
+                    (1:2, Base.OneTo(4), 1:3:10), (1:0.5:3, 2:4)] # 1 to 3 dimensions; unit, stepped, OneTo and floating-point ranges
+
+    @testsetwithinfo "ProductSplit" begin
+
+        function split_across_processors_iterators(arr::Iterators.ProductIterator, num_procs, proc_id)
+            # Reference splitter: lazily return the contiguous share of `arr` that belongs to
+            # worker `proc_id` (1-based) out of `num_procs`, keeping the iteration order.
+            base, extra = divrem(length(arr), num_procs)
+            # The first `extra` ranks receive one element more than the others
+            nmine = base + (proc_id <= extra ? 1 : 0)
+            # Number of elements that belong to the lower ranks
+            skipped = base * (proc_id - 1) + min(extra, proc_id - 1)
+            share = Iterators.drop(arr, skipped)
+            return Iterators.take(share, nmine)
+        end
+
+        function split_product_across_processors_iterators(arrs_tuple, num_procs, proc_id)
+            return split_across_processors_iterators(Iterators.product(arrs_tuple...), num_procs, proc_id)
+        end
+
+        @testset "Constructor" begin
+
+            function checkPSconstructor(iters, npmax = 10)
+                # Check the split of every rank p of np = 1:npmax workers against the reference splitter
+                expected_eltype = Tuple{map(eltype, iters)...}
+                ntasks_total = prod(length, iters)
+                for np in 1:npmax, p in 1:np
+                    psplit = ProductSplit(iters, np, p)
+                    @test eltype(psplit) == expected_eltype
+                    @test _niterators(psplit) == length(iters)
+                    isempty(psplit) || @test collect(psplit) == collect(split_product_across_processors_iterators(iters, np, p))
+                    @test prod(length, getiterators(psplit)) == ntasks_total
+                    @test ParallelUtilities.workerrank(psplit) == p
+                    @test nworkers(psplit) == np
+                end
+                # A rank larger than the number of workers must be rejected
+                @test_throws ArgumentError ProductSplit(iters, npmax, npmax + 1)
+            end
+
+            @testset "0D" begin
+                @test_throws ArgumentError ProductSplit((), 2, 1) # an empty tuple of iterators is rejected
+            end
+            # _cumprod(1, t): running products that start at 1 and leave out the last entry of t
+            @testset "cumprod" begin
+                @test ParallelUtilities._cumprod(1,()) == ()
+                @test ParallelUtilities._cumprod(1,(2,)) == (1,)
+                @test ParallelUtilities._cumprod(1,(2, 3)) == (1, 2)
+                @test ParallelUtilities._cumprod(1,(2, 3, 4)) == (1, 2, 6)
+            end
+            # The constructor checks are repeated for iterators of different dimensions and steps
+            @testset "1D" begin
+                iters = (1:10,)
+                checkPSconstructor(iters)
+            end
+            @testset "2D" begin
+                iters = (1:10, 4:6)
+                checkPSconstructor(iters)
+            end
+            @testset "3D" begin
+                iters = (1:10, 4:6, 1:4)
+                checkPSconstructor(iters)
+            end
+            @testset "steps" begin
+                iters = (1:2:10, 4:1:6)
+                checkPSconstructor(iters)
+            end
+            @testset "mixed" begin
+                for iters in [(1:2, 4:2:6), (1:2, Base.OneTo(4), 1:3:10)]
+                    checkPSconstructor(iters)
+                end
+            end
+            # A single task shared by 10 workers: rank 2 is left with an empty split
+            @testset "empty" begin
+                iters = (1:1,)
+                ps = ProductSplit(iters, 10, 2)
+                @test isempty(ps)
+                @test length(ps) == 0
+            end
+
+            @testset "first and last ind" begin
+                for iters in Any[(1:10,), (1:2, Base.OneTo(4), 1:3:10)]
+                    ntasks = prod(length, iters)
+                    half = div(ntasks, 2)
+                    lower = ProductSplit(iters, 2, 1)
+                    @test firstindex(lower) == 1
+                    @test ParallelUtilities.firstindexglobal(lower) == 1
+                    @test ParallelUtilities.lastindexglobal(lower) == half
+                    @test lastindex(lower) == half
+                    @test lastindex(lower) == length(lower)
+                    upper = ProductSplit(iters, 2, 2)
+                    @test ParallelUtilities.firstindexglobal(upper) == half + 1
+                    @test firstindex(upper) == 1
+                    @test ParallelUtilities.lastindexglobal(upper) == ntasks
+                    @test lastindex(upper) == length(upper)
+                    # Ranks beyond the number of tasks: the global range starts one past the last task
+                    for np in (ntasks + 1):(ntasks + 10), p in (ntasks + 1):np
+                        surplus = ProductSplit(iters, np, p)
+                        @test ParallelUtilities.firstindexglobal(surplus) == ntasks + 1
+                        @test ParallelUtilities.lastindexglobal(surplus) == ntasks
+                    end
+                end
+            end
+        end
+
+        @testset "firstlast" begin
+            @testset "first" begin
+                @test ParallelUtilities._first(()) == ()
+                # Whatever the number of workers, rank 1 must start at the first element
+                for iters in various_iters
+                    expected = map(first, iters)
+                    for np in 1:prod(length, iters)
+                        @test first(ProductSplit(iters, np, 1)) == expected
+                    end
+                end
+            end
+            @testset "last" begin
+                @test ParallelUtilities._last(()) == ()
+                # ... and the highest rank must end at the last element
+                for iters in various_iters
+                    expected = map(last, iters)
+                    for np in 1:prod(length, iters)
+                        @test last(ProductSplit(iters, np, np)) == expected
+                    end
+                end
+            end
+        end
+
+        @testset "extrema" begin
+
+            @testset "min max extrema" begin
+                # `fn_el` queries a ProductSplit; `fn` is the Base reduction it must match
+                # when applied to the collected elements, dimension by dimension. A split
+                # for which the two disagree is printed with `show` to ease debugging.
+                function checkPSextrema(iters, (fn_el, fn), npmax = 10)
+                    for np in 1:npmax, p in 1:np
+                        ps = ProductSplit(iters, np, p)
+                        isempty(ps) && continue
+                        pcol = collect(ps)
+                        for dims in 1:length(iters)
+                            @test begin
+                                agree = fn_el(ps, dims = dims) == fn(x[dims] for x in pcol)
+                                agree || show(ps)
+                                agree
+                            end
+                        end
+                        # With a single iterator the `dims` keyword is left out as well
+                        if _niterators(ps) == 1
+                            @test begin
+                                agree = fn_el(ps) == fn(x[1] for x in pcol)
+                                agree || show(ps)
+                                agree
+                            end
+                        end
+                    end
+                end
+
+                fnpairs = [(maximumelement, maximum), (minimumelement, minimum), (extremaelement, extrema)]
+                for iters in various_iters, fntup in fnpairs
+                    checkPSextrema(iters, fntup)
+                end
+
+                # 1:5 over two workers: the expected extrema are those of 1:3 and of 4:5
+                lower = ProductSplit((1:5,), 2, 1)
+                @test minimumelement(lower) == 1
+                @test maximumelement(lower) == 3
+                @test extremaelement(lower) == (1, 3)
+                upper = ProductSplit((1:5,), 2, 2)
+                @test minimumelement(upper) == 4
+                @test maximumelement(upper) == 5
+                @test extremaelement(upper) == (4, 5)
+            end
+
+            @testset "extremadims" begin
+                ps = ProductSplit((1:10,), 2, 1)
+                @test ParallelUtilities._extremadims(ps, 1,()) == ()
+                for iters in various_iters
+                    # np runs up to one more than the number of tasks, so empty splits are covered too
+                    for np = 1:prod(length, iters) + 1, proc_id = 1:np
+                        ps = ProductSplit(iters, np, proc_id)
+                        if isempty(ps)
+                            @test_throws ArgumentError extremadims(ps) # no elements to take extrema of
+                        else
+                            ext = Tuple(map(extrema, zip(collect(ps)...))) # reference: extrema of the collected elements, per dimension
+                            @test extremadims(ps) == ext
+                        end
+                    end
+                end
+            end
+
+            @testset "extrema_commonlastdim" begin
+                iters = (1:10, 4:6, 1:4)
+                nlast = prod(length, iters) + 1 # one more process than there are tasks
+                ps_mid, ps_last = ProductSplit(iters, 37, 8), ProductSplit(iters, nlast, nlast)
+                @test extrema_commonlastdim(ps_mid) == ([(9, 1), (6, 1)], [(2, 2), (4, 2)])
+                @test extrema_commonlastdim(ps_last) === nothing
+            end
+        end
+
+        @testset "in" begin
+
+            # Every element of a non-empty split must be found in that split, and in
+            # none of the other splits of the same iterators over the same np
+            function checkifpresent(iters, npmax = 10)
+                for np = 1:npmax, p = 1:np
+                    ps = ProductSplit(iters, np, p)
+                    isempty(ps) && continue
+                    elems = collect(ps)
+
+                    for el in elems
+                        # present in the split it was drawn from
+                        @test el in ps
+                        for other in 1:np
+                            # absent from the split of every other process
+                            other == p && continue
+                            ps_other = ProductSplit(iters, np, other)
+                            @test !(el in ps_other)
+                        end
+                    end
+                end
+            end
+
+            for iters in various_iters
+                checkifpresent(iters)
+            end
+
+            @test ParallelUtilities._infullrange((), ())
+        end
+
+        @testset "whichproc + procrange_recast" begin
+            np, proc_id = 5, 5 # the last of five processes
+            iters = (1:10, 4:6, 1:4)
+            ps = ProductSplit(iters, np, proc_id)
+            @test whichproc(iters, first(ps), 1) == 1
+            @test whichproc(ps, first(ps)) == proc_id
+            @test whichproc(ps, last(ps)) == proc_id
+            @test whichproc(iters, (100, 100, 100), 1) === nothing # value lies outside every range in iters
+            @test procrange_recast(iters, ps, 1) == 1:1
+            @test procrange_recast(ps, 1) == 1:1
+
+            smalleriter = (1:1, 1:1, 1:1)
+            err = ParallelUtilities.TaskNotPresentError(smalleriter, first(ps))
+            @test_throws err procrange_recast(smalleriter, ps, 1)
+            smalleriter = (7:9, 4:6, 1:4)
+            err = ParallelUtilities.TaskNotPresentError(smalleriter, last(ps))
+            @test_throws err procrange_recast(smalleriter, ps, 1)
+
+            iters = (1:1, 2:2) # a single task, split below over np = 5 processes
+            ps = ProductSplit(iters, np, proc_id)
+            @test procrange_recast(iters, ps, 2) === nothing
+            @test procrange_recast(ps, 2) === nothing
+
+            iters = (1:1, 2:2) # the same single task, now split over one process
+            ps = ProductSplit(iters, 1, 1)
+            @test procrange_recast(iters, ps, 2) == 1:1
+            @test procrange_recast(ps, 2) == 1:1
+
+            iters = (Base.OneTo(2), 2:4)
+            ps = ProductSplit(iters, 2, 1)
+            @test procrange_recast(iters, ps, 1) == 1:1
+            @test procrange_recast(iters, ps, 2) == 1:1
+            @test procrange_recast(iters, ps, prod(length, iters)) == 1:length(ps)
+
+            for np_new in 1:prod(length, iters)
+                for proc_id_new = 1:np_new
+                    ps_new = ProductSplit(iters, np_new, proc_id_new)
+
+                    for val in ps_new
+                        # Should loop only if ps_new is non-empty
+                        @test whichproc(iters, val, np_new) == proc_id_new
+                    end
+                end
+                @test procrange_recast(iters, ps, np_new) == (isempty(ps) ? nothing : (whichproc(iters, first(ps), np_new):whichproc(iters, last(ps), np_new)))
+                @test procrange_recast(ps, np_new) == (isempty(ps) ? nothing : (whichproc(iters, first(ps), np_new):whichproc(iters, last(ps), np_new)))
+            end
+
+            @testset "different set" begin
+                iters = (1:100, 1:4000)
+                ps = ProductSplit((20:30, 1:1), 2, 1)
+                @test procrange_recast(iters, ps, 700) == 1:1
+                ps = ProductSplit((20:30, 1:1), 2, 2)
+                @test procrange_recast(iters, ps, 700) == 1:1
+
+                iters = (1:1, 2:2)
+                ps = ProductSplit((20:30, 2:2), 2, 1)
+                @test_throws ParallelUtilities.TaskNotPresentError procrange_recast(iters, ps, 3)
+                ps = ProductSplit((1:30, 2:2), 2, 1)
+                @test_throws ParallelUtilities.TaskNotPresentError procrange_recast(iters, ps, 3)
+            end
+        end
+
+        @testset "indexinproduct" begin
+            @test indexinproduct((1:4, 2:3:8), (3, 5)) == 7 # 3 is 3rd in 1:4, 5 is 2nd in 2:3:8: 3 + 4 * (2 - 1), first index fastest
+            @test indexinproduct((1:4, 2:3:8), (3, 6)) === nothing # 6 is not an element of 2:3:8
+            @test_throws ArgumentError indexinproduct((), ())
+        end
+
+        @testset "localindex" begin
+            # The local index of each element is expected to equal its position
+            # in the iteration order of the split that contains it
+            for iters in various_iters, np in 1:prod(length, iters), proc_id in 1:np
+                ps = ProductSplit(iters, np, proc_id)
+                for (pos, el) in enumerate(ps)
+                    @test localindex(ps, el) == pos
+                end
+            end
+        end
+
+        @testset "whichproc_localindex" begin
+            for iters in various_iters
+                iters isa Tuple{AbstractUnitRange, Vararg{AbstractUnitRange}} || continue # unit ranges only
+                for np in 1:prod(length, iters), proc_id in 1:np
+                    elems = collect(ProductSplit(iters, np, proc_id))
+                    revelems = map(reverse, elems)
+                    for el in elems
+                        owner, locind = whichproc_localindex(iters, el, np)
+                        @test owner == proc_id
+                        expected = searchsortedfirst(revelems, reverse(el))
+                        @test locind == expected
+                    end
+                end
+            end
+        end
+
+        @testset "getindex" begin
+
+            # Degenerate cases: an empty tuple as the first argument
+            @test ParallelUtilities._getindex((), 1) == ()
+            @test ParallelUtilities._getindex((), 1, 2) == ()
+            @test ParallelUtilities.childindex((), 1) == (1,)
+
+            for iters in various_iters, np in 1:prod(length, iters), p in 1:np
+                ps = ProductSplit(iters, np, p)
+                elems = collect(ps)
+                # Indexing must agree with iteration order
+                for i in 1:length(ps)
+                    @test ps[i] == elems[i]
+                end
+                @test ps[end] == ps[length(ps)]
+                # Index 0 and one past the last index must throw
+                for badind in (0, length(ps) + 1)
+                    @test_throws ParallelUtilities.BoundsError(ps, badind) ps[badind]
+                end
+            end
+        end
+    end
+    @testsetwithinfo "ProductSection" begin
+        @testset "Constructor" begin
+            # Every contiguous range of linear indices into the full product must give
+            # a section whose elements match the corresponding entries of the product
+            function testPS(iterators)
+                fullprod = collect(Iterators.product(iterators...))
+                total = length(fullprod)
+                for startind in 1:total, endind in startind:total
+                    ps = ProductSection(iterators, startind:endind)
+                    @test eltype(ps) == Tuple{map(eltype, iterators)...}
+                    for (psind, ind) in enumerate(startind:endind)
+                        @test ps[psind] == fullprod[ind]
+                    end
+                end
+            end
+
+            foreach(testPS, various_iters)
+
+            @test_throws ArgumentError ProductSection((), 2:3)
+        end
+    end
+    @testset "dropleading" begin
+        ps = ProductSplit((1:5, 2:4, 1:3), 7, 3);
+        @test dropleading(ps) isa ProductSection # a ProductSplit is expected to give a ProductSection
+        @test collect(dropleading(ps)) == [(4, 1), (2, 2), (3, 2)] # tuples have one element fewer
+        @test collect(dropleading(dropleading(ps))) == [(1,), (2,)]
+
+        ps = ProductSection((1:5, 2:4, 1:3), 5:8); # linear indices 5:8: (5, 2, 1), (1, 3, 1), (2, 3, 1), (3, 3, 1)
+        @test dropleading(ps) isa ProductSection
+        @test collect(dropleading(ps)) == [(2, 1), (3, 1)] # distinct (dim 2, dim 3) pairs of those elements
+        @test collect(dropleading(dropleading(ps))) == [(1,)]
+    end
+    @testset "nelements" begin
+        # nelements(ps, dims = d) is compared against an expected count for each dimension d
+        ranges = (1:5, 2:4, 1:3)
+
+        split = ProductSplit(ranges, 7, 3)
+        for (d, n) in enumerate((5, 3, 2))
+            @test nelements(split, dims = d) == n
+        end
+        # Dimensions outside 1:3 must be rejected
+        for d in (0, 4)
+            @test_throws ArgumentError nelements(split, dims = d)
+        end
+
+        # (linear index range of the section, expected counts along dims 1, 2 and 3)
+        cases = [
+            (5:8, (4, 2, 1)),
+            (5:11, (5, 3, 1)),
+            (4:8, (5, 2, 1)),
+            (4:9, (5, 2, 1)),
+        ]
+
+        for (inds, counts) in cases
+            section = ProductSection(ranges, inds)
+            for (d, n) in enumerate(counts)
+                @test nelements(section, dims = d) == n
+            end
+        end
+    end
+
+    @test ParallelUtilities._checknorollover((), (), ())
+end;
+
+@testset "ReverseLexicographicTuple" begin
+    RLT = ParallelUtilities.ReverseLexicographicTuple
+    @testset "isless" begin
+        a = RLT((1, 2, 3))
+        # Tuples expected to order after `a`
+        for t in ((2, 2, 3), (2, 1, 4))
+            b = RLT(t)
+            @test a < b
+            @test a <= b
+        end
+        # Tuples expected to order before `a`
+        for t in ((1, 1, 3), (2, 1, 3))
+            b = RLT(t)
+            @test b < a
+            @test b <= a
+        end
+    end
+    @testset "equal" begin
+        a = RLT((1, 2, 3))
+        # Compare `a` with itself and with a second wrapper around the same tuple
+        for b in (a, RLT(a.t))
+            @test a == b
+            @test isequal(a, b)
+            @test a <= b
+        end
+    end
+end;
diff --git a/test/singlehost.jl b/test/singlehost.jl
index 30dd03f..22fb07a 100644
--- a/test/singlehost.jl
+++ b/test/singlehost.jl
@@ -1,8 +1,16 @@
 using Distributed
 
-const workersused = 8
-addprocs(workersused)
+include("misctests_singleprocess.jl")
+include("productsplit.jl")
+include("paralleltests.jl")
 
-include("tests.jl")
+for workersused in (1, 2, 4, 8)
+    # Add the workers for this pass; the `finally` below removes all current
+    # workers again, even when the included tests throw
+    addprocs(workersused)
+    try
+        include("paralleltests.jl")
+    finally
+        rmprocs(workers())
+    end
+end
 
-rmprocs(workers())
diff --git a/test/tests.jl b/test/tests.jl
deleted file mode 100644
index b39f63d..0000000
--- a/test/tests.jl
+++ /dev/null
@@ -1,2665 +0,0 @@
-@everywhere begin
-    using DataStructures
-    using Test
-    using ParallelUtilities
-    import ParallelUtilities: BinaryTreeNode, RemoteChannelContainer, BranchChannel, 
-	Sorted, Unsorted, Ordering, pval, value, reducedvalue, reduceTreeNode, mapTreeNode,
-    SequentialBinaryTree, OrderedBinaryTree, SegmentedSequentialBinaryTree,
-    SegmentedOrderedBinaryTree,
-    parentnoderank, nchildren,
-    maybepvalput!, createbranchchannels, nworkersactive, workersactive,
-    procs_node, leafrankfoldedtree,
-    TopTreeNode, SubTreeNode, ProductSection, indexinproduct, dropleading,
-    nelements, getiterators, firstindexglobal, lastindexglobal
-end
-
-const future_release_warn = r"will not be exported in a future release"i
-
-@test isempty(Test.detect_ambiguities(Base, Core, ParallelUtilities))
-
-macro testsetwithinfo(str,ex)
-    quote
-        @info "Testing "*$str
-        @testset $str begin $(esc(ex)); end;
-    end
-end
-
-function showworkernumber(ind,nw)
-    # Cursor starts off at the beginning of the line
-    print("\u1b[K") # clear till end of line
-    print("Testing on worker $ind of $nw")
-    # return the cursor to the beginning of the line
-    endchar = ind == nw ? "\n" : "\r"
-    print(endchar)
-end
-
-@testsetwithinfo "AbstractConstrainedProduct" begin
-
-    various_iters = [(1:10,),(1:10,4:6),(1:10,4:6,1:4),(1:2:10,4:1:6),
-                    (1:2,Base.OneTo(4),1:3:10),(1:0.5:3,2:4)]
-
-    @testsetwithinfo "ProductSplit" begin
-
-    	function split_across_processors_iterators(arr::Iterators.ProductIterator,num_procs,proc_id)
-
-    	    num_tasks = length(arr);
-
-    	    num_tasks_per_process,num_tasks_leftover = divrem(num_tasks,num_procs)
-
-    	    num_tasks_on_proc = num_tasks_per_process + (proc_id <= mod(num_tasks,num_procs) ? 1 : 0 );
-    	    task_start = num_tasks_per_process*(proc_id-1) + min(num_tasks_leftover,proc_id-1) + 1;
-
-    	    Iterators.take(Iterators.drop(arr,task_start-1),num_tasks_on_proc)
-    	end
-
-    	function split_product_across_processors_iterators(arrs_tuple,num_procs,proc_id)
-    		split_across_processors_iterators(Iterators.product(arrs_tuple...),num_procs,proc_id)
-    	end
-
-        @testset "Constructor" begin
-
-    	    function checkPSconstructor(iters,npmax=10)
-    	    	ntasks_total = prod(length, iters)
-    			for np = 1:npmax, p = 1:np
-    		        ps = ProductSplit(iters, np, p)
-                    @test eltype(ps) == Tuple{eltype.(iters)...}
-                    @test ndims(ps) == length(iters)
-    		        @test collect(ps) == collect(split_product_across_processors_iterators(iters,np,p))
-    		        @test (@test_deprecated ntasks(ps)) == ntasks_total
-    		        @test prod(length, getiterators(ps)) == ntasks_total
-                    @test ParallelUtilities.workerrank(ps) == p
-    		    end
-
-    		    @test_throws ArgumentError ProductSplit(iters, npmax, npmax+1)
-    		end
-
-    		@testset "0D" begin
-    		    @test_throws ArgumentError ProductSplit((),2,1)
-    		end
-
-            @testset "cumprod" begin
-                @test ParallelUtilities._cumprod(1,()) == ()
-                @test ParallelUtilities._cumprod(1,(2,)) == (1,)
-                @test ParallelUtilities._cumprod(1,(2,3)) == (1,2)
-                @test ParallelUtilities._cumprod(1,(2,3,4)) == (1,2,6)
-            end
-
-        	@testset "1D" begin
-    	    	iters = (1:10,)
-    	    	checkPSconstructor(iters)
-        	end
-        	@testset "2D" begin
-    	    	iters = (1:10,4:6)
-    	    	checkPSconstructor(iters)
-        	end
-        	@testset "3D" begin
-    	    	iters = (1:10,4:6,1:4)
-    	    	checkPSconstructor(iters)
-        	end
-        	@testset "steps" begin
-    	    	iters = (1:2:10,4:1:6)
-    	    	checkPSconstructor(iters)
-    	    	iters = (10:-1:10,6:-2:0)
-    	    	@test_throws ArgumentError ProductSplit(iters,3,2)
-        	end
-        	@testset "mixed" begin
-        	    for iters in [(1:2,4:2:6),(1:2,Base.OneTo(4),1:3:10)]
-    	    		checkPSconstructor(iters)
-    	    	end
-        	end
-
-        	@testset "empty" begin
-        	    iters = (1:1,)
-        	    ps = ProductSplit(iters,10,2)
-        	    @test isempty(ps)
-        	    @test length(ps) == 0
-        	end
-
-        	@testset "first and last ind" begin
-        	    for iters in [(1:10,),(1:2,Base.OneTo(4),1:3:10)]
-    	    	    ps = ProductSplit(iters,2,1)
-    	    	    @test firstindex(ps) == 1
-    	    	    @test firstindexglobal(ps) == 1
-    	    	    @test lastindexglobal(ps) == div(prod(length, iters),2)
-    	    	    @test lastindex(ps) == div(prod(length, iters),2)
-    	    	    @test lastindex(ps) == length(ps)
-    	    	    ps = ProductSplit(iters,2,2)
-    	    	    @test firstindexglobal(ps) == div(prod(length, iters),2) + 1
-    	    	    @test firstindex(ps) == 1
-    	    	    @test lastindexglobal(ps) == prod(length, iters)
-    	    	    @test lastindex(ps) == length(ps)
-
-    	    	    for np in prod(length, iters)+1:prod(length, iters)+10,
-    	    	    	p in prod(length, iters)+1:np
-
-    		    	    ps = ProductSplit(iters,np,p)
-    		    	    @test firstindexglobal(ps) == prod(length, iters) + 1
-    		    	    @test lastindexglobal(ps) == prod(length, iters)
-    		    	end
-    		    end
-        	end
-
-            @testset "summary and show" begin
-                iters = (1:3, 4:5:19)
-                ps = ProductSplit(iters,3,2)
-                reprstr = "ProductSplit("*repr(iters)*", 3, 2)"
-                @test ParallelUtilities.mwerepr(ps) == reprstr
-
-                summarystr = "$(length(ps))-element "*reprstr
-                @test occursin(summarystr, ParallelUtilities.summary(ps))
-
-                io = IOBuffer()
-                summary(io,ps)
-                @test occursin(summarystr, String(take!(io)))
-
-                show(io, ps)
-                @test occursin(summarystr, String(take!(io)))
-
-                ps = ParallelUtilities.ProductSection(iters,4:5)
-                reprstr = "ProductSection("*repr(iters)*", " * repr(4:5) * ")"
-                @test ParallelUtilities.mwerepr(ps) == reprstr
-            end
-        end
-
-        @testset "firstlast" begin
-            @testset "first" begin
-
-            	@test ParallelUtilities._first(()) == ()
-
-                for iters in various_iters,np=1:5prod(length, iters)
-
-    	            ps = ProductSplit(iters,np,1)
-    	            @test first(ps) == ( isempty(ps) ? nothing : map(first,iters) )
-    	        end
-
-    	        iters = (1:1,)
-    	        ps = ProductSplit(iters,2prod(length, iters),prod(length, iters)+1) # must be empty
-    	        @test first(ps) === nothing
-            end
-            @testset "last" begin
-
-            	@test ParallelUtilities._last(()) == ()
-
-                for iters in various_iters,np=1:5prod(length, iters)
-
-    	            ps = ProductSplit(iters,np,np)
-    	            @test last(ps) == ( isempty(ps) ? nothing : map(last,iters) )
-    	        end
-
-    	        iters = (1:1,)
-    	        ps = ProductSplit(iters,2length(iters[1]),length(iters[1])+1) # must be empty
-    	        @test last(ps) === nothing
-            end
-        end
-
-        @testset "extrema" begin
-
-        	@testset "min max extrema" begin
-    	    	function checkPSextrema(iters,fn::Function,npmax=10)
-    				for np = 1:npmax, p = 1:np
-    			        ps = ProductSplit(iters,np,p)
-    			        pcol = collect(ps)
-    			        for dim in 1:length(iters)
-    			        	@test begin
-    			        		res = fn(ps,dim=dim) == fn(x[dim] for x in pcol)
-                                @test (@test_deprecated fn(ps, dim)) == fn(ps, dim=dim)
-    			        		if !res
-    			        			println(summary(ps))
-    			        		end
-    			        		res
-    			        	end
-    			        end
-    			    end
-    			end
-
-    		    for iters in various_iters,	fn in [maximum,minimum,extrema]
-    		        checkPSextrema(iters,fn)
-    		    end
-
-                @test minimum(ProductSplit((1:5,),2,1)) == 1
-                @test maximum(ProductSplit((1:5,),2,1)) == 3
-                @test extrema(ProductSplit((1:5,),2,1)) == (1,3)
-
-                @test minimum(ProductSplit((1:5,),2,2)) == 4
-                @test maximum(ProductSplit((1:5,),2,2)) == 5
-                @test extrema(ProductSplit((1:5,),2,2)) == (4,5)
-        	end
-
-        	@testset "extremadims" begin
-        		ps = ProductSplit((1:10,),2,1)
-        		@test ParallelUtilities._extremadims(ps,1,()) == ()
-        		for iters in various_iters
-
-        			dims = length(iters)
-    	    		for np = 1:5prod(length, iters), proc_id = 1:np
-    	    	    	ps = ProductSplit(iters,np,proc_id)
-    	    	    	if isempty(ps)
-    	    	    		@test (@test_deprecated future_release_warn extremadims(ps)) == Tuple(nothing for i=1:dims)
-    	    	    	else
-    		    	    	ext = Tuple(map(extrema,zip(collect(ps)...)))
-    		    	    	@test (@test_deprecated future_release_warn extremadims(ps)) == ext
-    		    	    end
-    	    	    end
-    	    	end
-        	end
-
-        	@testset "extrema_commonlastdim" begin
-        	    iters = (1:10,4:6,1:4)
-        	    ps = ProductSplit(iters,37,8)
-        	    @test (@test_deprecated future_release_warn extrema_commonlastdim(ps)) == ([(9,1),(6,1)],[(2,2),(4,2)])
-        	    ps = ProductSplit(iters,prod(length, iters)+1,prod(length, iters)+1)
-        	    @test (@test_deprecated future_release_warn extrema_commonlastdim(ps)) === nothing
-        	end
-        end
-
-        @testset "in" begin
-
-        	function checkifpresent(iters,npmax=10)
-        		for np = 1:npmax, p = 1:np
-    		        ps = ProductSplit(iters,np,p)
-    		        pcol = collect(ps)
-
-    		        for el in pcol
-    		        	# It should be contained in this iterator
-    		        	@test el in ps
-    		        	for p2 in 1:np
-    		        		# It should not be contained anywhere else
-    		        		p2 == p && continue
-    		        		ps2 = ProductSplit(iters,np,p2)
-    		        		@test !(el in ps2)
-    		        	end
-    		        end
-    		    end
-        	end
-
-            for iters in various_iters
-    	        checkifpresent(iters)
-    	    end
-
-    	    @test ParallelUtilities._infullrange((),())
-        end
-
-        @testset "whichproc + procrange_recast" begin
-            np,proc_id = 5,5
-            iters = (1:10,4:6,1:4)
-            ps = ProductSplit(iters,np,proc_id)
-            @test whichproc(iters,first(ps),1) == 1
-            @test whichproc(ps,first(ps)) == proc_id
-            @test whichproc(ps,last(ps)) == proc_id
-            @test whichproc(iters,(100,100,100),1) === nothing
-            @test (@test_deprecated future_release_warn procrange_recast(iters,ps,1)) == 1:1
-            @test (@test_deprecated future_release_warn procrange_recast(ps,1)) == 1:1
-
-            smalleriter = (1:1,1:1,1:1)
-            err = ParallelUtilities.TaskNotPresentError(smalleriter,first(ps))
-            @test_deprecated future_release_warn @test_throws err procrange_recast(smalleriter,ps,1)
-            smalleriter = (7:9,4:6,1:4)
-            err = ParallelUtilities.TaskNotPresentError(smalleriter,last(ps))
-            @test_deprecated future_release_warn @test_throws err procrange_recast(smalleriter,ps,1)
-
-            iters = (1:1,2:2)
-            ps = ProductSplit(iters,np,proc_id)
-            @test whichproc(iters,first(ps),np) === nothing
-            @test whichproc(iters,nothing,np) === nothing
-            @test (@test_deprecated future_release_warn procrange_recast(iters,ps,2)) == (0:-1)
-            @test (@test_deprecated future_release_warn procrange_recast(ps,2)) == (0:-1)
-
-            iters = (1:1,2:2)
-            ps = ProductSplit(iters,1,1)
-            @test (@test_deprecated future_release_warn procrange_recast(iters,ps,2)) == 1:1
-            @test (@test_deprecated future_release_warn procrange_recast(ps,2)) == 1:1
-
-            iters = (Base.OneTo(2),2:4)
-            ps = ProductSplit(iters,2,1)
-            @test (@test_deprecated future_release_warn procrange_recast(iters,ps,1)) == 1:1
-            @test (@test_deprecated future_release_warn procrange_recast(iters,ps,2)) == 1:1
-            @test (@test_deprecated future_release_warn procrange_recast(iters,ps, prod(length, iters))) == 1:length(ps)
-
-            for np_new in 1:5prod(length, iters)
-            	for proc_id_new=1:np_new
-    	        	ps_new = ProductSplit(iters,np_new,proc_id_new)
-
-    	        	for val in ps_new
-    	        		# Should loop only if ps_new is non-empty
-    	        		@test whichproc(iters,val,np_new) == proc_id_new
-    	        	end
-    	        end
-    	        procid_new_first = whichproc(iters,first(ps),np_new)
-    	        proc_new_last = whichproc(iters,last(ps),np_new)
-            	@test (@test_deprecated future_release_warn procrange_recast(iters,ps,np_new)) == (isempty(ps) ? (0:-1) : (procid_new_first:proc_new_last))
-            	@test (@test_deprecated future_release_warn procrange_recast(ps,np_new)) == (isempty(ps) ? (0:-1) : (procid_new_first:proc_new_last))
-            end
-
-            @testset "different set" begin
-    	        iters = (1:100,1:4000)
-    	        ps = ProductSplit((20:30,1:1),2,1)
-    	        @test (@test_deprecated future_release_warn procrange_recast(iters,ps,700)) == 1:1
-    	        ps = ProductSplit((20:30,1:1),2,2)
-    	        @test (@test_deprecated future_release_warn procrange_recast(iters,ps,700)) == 1:1
-
-    	        iters = (1:1,2:2)
-    	        ps = ProductSplit((20:30,2:2),2,1)
-    	        @test_deprecated future_release_warn @test_throws ParallelUtilities.TaskNotPresentError procrange_recast(iters,ps,3)
-    	        ps = ProductSplit((1:30,2:2),2,1)
-    	        @test_deprecated future_release_warn @test_throws ParallelUtilities.TaskNotPresentError procrange_recast(iters,ps,3)
-            end
-        end
-
-        @testset "indexinproduct" begin
-            @test indexinproduct((1:4,2:3:8),(3,5)) == 7
-            @test indexinproduct((1:4,2:3:8),(3,6)) === nothing
-            @test_throws ArgumentError indexinproduct((),())
-        end
-
-        @testset "localindex" begin
-            
-            for iters in various_iters
-    	        for np=1:5prod(length, iters),proc_id=1:np
-    	        	ps = ProductSplit(iters,np,proc_id)
-    	        	for (ind,val) in enumerate(ps)
-    	        		@test localindex(ps,val) == ind
-    	        		@test localindex(iters,val,np,proc_id) == ind
-    	        	end
-    	        	if isempty(ps)
-    	        		@test localindex(ps,first(ps)) === nothing
-    	        	end
-    	        end
-    	    end
-        end
-
-        @testset "whichproc_localindex" begin
-            for iters in various_iters
-    	        for np=1:prod(length, iters),proc_id=1:np
-    	        	ps_col = collect(ProductSplit(iters,np,proc_id))
-    	        	ps_col_rev = [reverse(t) for t in ps_col] 
-    	        	for val in ps_col
-    	        		p,ind = whichproc_localindex(iters,val,np)
-    	        		@test p == proc_id
-    	        		ind_in_arr = searchsortedfirst(ps_col_rev,reverse(val))
-    	        		@test ind == ind_in_arr
-    	        	end
-    	        end
-    	    end
-        end
-
-        @testset "getindex" begin
-        	
-        	@test ParallelUtilities._getindex((),1) == ()
-        	@test ParallelUtilities._getindex((),1,2) == ()
-
-        	@test ParallelUtilities.childindex((),1) == (1,)
-
-            for iters in various_iters
-                for np=1:prod(length, iters),p=1:np
-                	ps = ProductSplit(iters,np,p)
-                	ps_col = collect(ps)
-                	for i in 1:length(ps)
-                		@test ps[i] == ps_col[i]
-                	end
-                	@test ps[end] == ps[length(ps)]
-                    for ind in [0,length(ps)+1]
-                	   @test_throws ParallelUtilities.BoundsError(ps,ind) ps[ind]
-                    end
-                end
-            end
-        end
-    end
-    @testsetwithinfo "ProductSection" begin
-        @testset "Constructor" begin
-            function testPS(iterators)
-                itp = collect(Iterators.product(iterators...))
-                l = length(itp)
-                for startind in 1:l, endind in startind:l
-                    ps = ProductSection(iterators, startind:endind)
-                    @test eltype(ps) == Tuple{eltype.(iterators)...}
-                    for (psind,ind) in enumerate(startind:endind)
-                        @test ps[psind] == itp[ind]
-                    end
-                end
-            end
-
-            for iter in various_iters
-                testPS(iter)
-            end
-
-            @test_throws ArgumentError ProductSection((),2:3)
-        end
-    end
-    @testset "dropleading" begin
-        ps = ProductSplit((1:5,2:4,1:3),7,3);
-        @test dropleading(ps) isa ProductSection
-        @test collect(dropleading(ps)) == [(4,1),(2,2),(3,2)]
-        @test collect(dropleading(dropleading(ps))) == [(1,),(2,)]
-
-        ps = ProductSection((1:5,2:4,1:3),5:8);
-        @test dropleading(ps) isa ProductSection
-        @test collect(dropleading(ps)) == [(2,1),(3,1)]
-        @test collect(dropleading(dropleading(ps))) == [(1,)]
-    end
-    @testset "nelements" begin
-        ps = ProductSplit((1:5,2:4,1:3),7,3);
-        @test nelements(ps, dim = 1) == 5
-        @test nelements(ps, dim = 2) == 3
-        @test nelements(ps, dim = 3) == 2
-        @test_throws ArgumentError nelements(ps, dim = 0)
-        @test_throws ArgumentError nelements(ps, dim = 4)
-
-        ps = ProductSection((1:5,2:4,1:3),5:8);
-        @test (@test_deprecated nelements(ps,1)) == nelements(ps, dim = 1)
-        @test nelements(ps, dim =1) == 4
-        @test nelements(ps, dim =2) == 2
-        @test nelements(ps, dim =3) == 1
-
-        ps = ProductSection((1:5,2:4,1:3),5:11);
-        @test nelements(ps, dim = 1) == 5
-        @test nelements(ps, dim = 2) == 3
-        @test nelements(ps, dim = 3) == 1
-
-        ps = ProductSection((1:5,2:4,1:3),4:8);
-        @test nelements(ps, dim = 1) == 5
-        @test nelements(ps, dim = 2) == 2
-        @test nelements(ps, dim = 3) == 1
-
-        ps = ProductSection((1:5,2:4,1:3),4:9);
-        @test nelements(ps, dim = 1) == 5
-        @test nelements(ps, dim = 2) == 2
-        @test nelements(ps, dim = 3) == 1
-    end
-    
-    @test ParallelUtilities._checknorollover((),(),())
-    @test ParallelUtilities.c2l_rec(3,1,(),()) == 3
-end;
-
-@testset "ReverseLexicographicTuple" begin
-    @testset "isless" begin
-    	a = ParallelUtilities.ReverseLexicographicTuple((1,2,3))
-        b = ParallelUtilities.ReverseLexicographicTuple((2,2,3))
-        @test a < b
-        @test a <= b
-        b = ParallelUtilities.ReverseLexicographicTuple((1,1,3))
-        @test b < a
-        @test b <= a
-        b = ParallelUtilities.ReverseLexicographicTuple((2,1,3))
-        @test b < a
-        @test b <= a
-        b = ParallelUtilities.ReverseLexicographicTuple((2,1,4))
-        @test a < b
-        @test a <= b
-    end
-    @testset "equal" begin
-        a = ParallelUtilities.ReverseLexicographicTuple((1,2,3))
-        @test a == a
-        @test isequal(a,a)
-        @test a <= a
-        b = ParallelUtilities.ReverseLexicographicTuple(a.t)
-        @test a == b
-        @test isequal(a,b)
-        @test a <= b
-    end
-end;
-
-@testset "utilities" begin
-    @testset "workers active" begin
-        @test nworkersactive((1:1,)) == 1
-        @test nworkersactive((1:2,)) == min(2,nworkers())
-        @test nworkersactive((1:1,1:2)) == min(2,nworkers())
-        @test nworkersactive(1:2) == min(2,nworkers())
-        @test nworkersactive(1:1,1:2) == min(2,nworkers())
-        @test nworkersactive((1:nworkers()+1,)) == nworkers()
-        @test nworkersactive(1:nworkers()+1) == nworkers()
-    	@test workersactive((1:1,)) == workers()[1:1]
-    	@test workersactive(1:1) == workers()[1:1]
-    	@test workersactive(1:1,1:1) == workers()[1:1]
-        @test workersactive((1:2,)) == workers()[1:min(2,nworkers())]
-        @test workersactive((1:1,1:2)) == workers()[1:min(2,nworkers())]
-        @test workersactive(1:1,1:2) == workers()[1:min(2,nworkers())]
-        @test workersactive((1:nworkers()+1,)) == workers()
-        @test workersactive(1:nworkers()+1) == workers()
-
-        ps = ProductSplit((1:10,),nworkers(),1)
-        @test nworkersactive(ps) == min(10,nworkers())
-
-        iters = (1:1,1:2)
-        ps = ProductSplit(iters,2,1)
-        @test nworkersactive(ps) == nworkersactive(iters)
-        @test workersactive(ps) == workersactive(iters)
-    end
-
-    @testset "hostnames" begin
-        @test (@test_deprecated gethostnames()) == hostnames()
-    	hosts = hostnames()
-    	nodes = unique(hosts)
-        @test hosts == [@fetchfrom p Libc.gethostname() for p in workers()]
-        @test (@test_deprecated future_release_warn nodenames()) == nodes
-        @test (@test_deprecated future_release_warn nodenames(hosts)) == nodes
-        np1 = @test_deprecated future_release_warn nprocs_node(hosts,nodes)
-        np2 = @test_deprecated future_release_warn nprocs_node(hosts)
-        np3 = @test_deprecated future_release_warn nprocs_node()
-        @test np1 == np2 == np3
-        for node in nodes
-            npnode = count(isequal(node),hosts)
-            @test np1[node] == npnode
-        end
-        p1 = @test_deprecated future_release_warn procs_node()
-        p2 = @test_deprecated future_release_warn procs_node(workers(), hosts, nodes)
-        @test p1 == p2
-        for node in nodes
-            pnode = workers()[findall(isequal(node),hosts)]
-            @test p1[node] == pnode
-        end
-        np4 = @test_deprecated future_release_warn nprocs_node(p1)
-        @test np1 == np4
-    end
-end;
-
-@testset "BinaryTree" begin
-    @testsetwithinfo "BinaryTreeNode" begin
-    	@testset "Constructor" begin
-	    	p = workers()[1]
-	    	b = BinaryTreeNode(p,p,0)
-	        @test nchildren(b) == 0
-	        b = BinaryTreeNode(p,p,1)
-	        @test nchildren(b) == 1
-	        b = BinaryTreeNode(p,p,2)
-	        @test nchildren(b) == 2
-
-	        @test_throws DomainError BinaryTreeNode(p,p,3)
-            @test_throws DomainError BinaryTreeNode(p,p,-1)
-    	end
-    end
-
-    @testsetwithinfo "BinaryTree" begin
-        @testsetwithinfo "SequentialBinaryTree" begin
-            @testset "pid and parent" begin
-                for imax = 1:100
-                    procs = 1:imax
-                    tree = SequentialBinaryTree(procs)
-                    @test length(tree) == length(procs) 
-                    topnoderank = ParallelUtilities.topnoderank(tree)
-                    @test topnoderank == 1
-                    @test tree[topnoderank] == ParallelUtilities.topnode(tree)
-                    @test tree[1].parent == 1
-                    for rank in 1:length(tree)
-                        node = tree[rank]
-                        @test node.p == procs[rank]
-                        @test node.parent == procs[parentnoderank(tree,rank)]
-                    end
-
-                    for ind in [0,imax+1]
-                        @test_throws BoundsError(tree,ind) parentnoderank(tree,ind)
-                        @test_throws BoundsError(tree,ind) tree[ind]
-                    end
-                end
-            end
-                
-            @testset "nchildren" begin
-                tree = SequentialBinaryTree(1:1)
-                @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 0
-                @test_throws BoundsError(tree,0) nchildren(tree,0)
-                @test_throws BoundsError(tree,2) nchildren(tree,2)
-
-                tree = SequentialBinaryTree(1:2)
-                @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 1
-                @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 0
-                @test_throws BoundsError(tree,0) nchildren(tree,0)
-                @test_throws BoundsError(tree,3) nchildren(tree,3)
-
-                tree = SequentialBinaryTree(1:8)
-                @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 2
-                @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 2
-                @test nchildren(tree,3) == nchildren(tree[3]) == tree[3].nchildren == 2
-                @test nchildren(tree,4) == nchildren(tree[4]) == tree[4].nchildren == 1
-                @test nchildren(tree,5) == nchildren(tree[5]) == tree[5].nchildren == 0
-                @test nchildren(tree,6) == nchildren(tree[6]) == tree[6].nchildren == 0
-                @test nchildren(tree,7) == nchildren(tree[7]) == tree[7].nchildren == 0
-                @test nchildren(tree,8) == nchildren(tree[8]) == tree[8].nchildren == 0
-                @test_throws BoundsError(tree,0) nchildren(tree,0)
-                @test_throws BoundsError(tree,9) nchildren(tree,9)
-            end
-
-            @testset "level" begin
-                tree = SequentialBinaryTree(1:15)
-                @test ParallelUtilities.levels(tree) == 4
-
-                @test ParallelUtilities.levelfromtop(tree,1) == 1
-                @test ParallelUtilities.levelfromtop.((tree,),2:3) == ones(Int,2)*2
-                @test ParallelUtilities.levelfromtop.((tree,),4:7) == ones(Int,4)*3
-                @test ParallelUtilities.levelfromtop.((tree,),8:15) == ones(Int,8)*4
-
-                for p in [0,length(tree)+1]
-                    @test_throws BoundsError(tree,p) ParallelUtilities.levelfromtop(tree,p)
-                end
-            end
-
-            @testset "summary" begin
-                tree = SequentialBinaryTree(1:4)
-                io = IOBuffer()
-                summary(io,tree)
-                strexp = "$(length(tree))-node $(typeof(tree))"
-                @test String(take!(io)) == strexp
-                @test summary(tree) == strexp
-            end
-        end
-
-        @testsetwithinfo "OrderedBinaryTree" begin
-            @testset "pid and parent" begin
-                for imax = 1:100
-                    procs = 1:imax
-                    tree = OrderedBinaryTree(procs)
-                    @test length(tree) == length(procs)
-
-                    topnoderank = ParallelUtilities.topnoderank(tree)
-                    @test tree[topnoderank].parent == topnoderank
-                    for rank in 1:length(tree)
-                        node = tree[rank]
-                        @test node.p == procs[rank]
-                        @test node.parent == procs[parentnoderank(tree,rank)]
-                    end
-                    @test_throws BoundsError(tree,0) parentnoderank(tree,0)
-                    @test_throws BoundsError(tree,imax+1) parentnoderank(tree,imax+1)
-                end
-            end
-
-            @testset "nchildren" begin
-                tree = OrderedBinaryTree(1:1)
-                @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 0
-                @test_throws BoundsError(tree,0) nchildren(tree,0)
-                @test_throws BoundsError(tree,2) nchildren(tree,2)
-                @test ParallelUtilities.topnoderank(tree) == 1
-
-                tree = OrderedBinaryTree(1:2)
-                @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 0
-                @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 1
-                @test_throws BoundsError(tree,0) nchildren(tree,0)
-                @test_throws BoundsError(tree,3) nchildren(tree,3)
-                @test ParallelUtilities.topnoderank(tree) == 2
-
-                tree = OrderedBinaryTree(1:8)
-                @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 0
-                @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 2
-                @test nchildren(tree,3) == nchildren(tree[3]) == tree[3].nchildren == 0
-                @test nchildren(tree,4) == nchildren(tree[4]) == tree[4].nchildren == 2
-                @test nchildren(tree,5) == nchildren(tree[5]) == tree[5].nchildren == 0
-                @test nchildren(tree,6) == nchildren(tree[6]) == tree[6].nchildren == 2
-                @test nchildren(tree,7) == nchildren(tree[7]) == tree[7].nchildren == 0
-                @test nchildren(tree,8) == nchildren(tree[8]) == tree[8].nchildren == 1
-                @test_throws BoundsError(tree,0) nchildren(tree,0)
-                @test_throws BoundsError(tree,9) nchildren(tree,9)
-                @test ParallelUtilities.topnoderank(tree) == 8
-
-                tree = OrderedBinaryTree(1:11)
-                @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 0
-                @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 2
-                @test nchildren(tree,3) == nchildren(tree[3]) == tree[3].nchildren == 0
-                @test nchildren(tree,4) == nchildren(tree[4]) == tree[4].nchildren == 2
-                @test nchildren(tree,5) == nchildren(tree[5]) == tree[5].nchildren == 0
-                @test nchildren(tree,6) == nchildren(tree[6]) == tree[6].nchildren == 2
-                @test nchildren(tree,7) == nchildren(tree[7]) == tree[7].nchildren == 0
-                @test nchildren(tree,8) == nchildren(tree[8]) == tree[8].nchildren == 2
-                @test nchildren(tree,9) == nchildren(tree[9]) == tree[9].nchildren == 0
-                @test nchildren(tree,10) == nchildren(tree[10]) == tree[10].nchildren == 2
-                @test nchildren(tree,11) == nchildren(tree[11]) == tree[11].nchildren == 0
-                @test_throws BoundsError(tree,0) nchildren(tree,0)
-                @test_throws BoundsError(tree,12) nchildren(tree,12)
-                @test ParallelUtilities.topnoderank(tree) == 8
-
-                tree = OrderedBinaryTree(1:13)
-                @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 0
-                @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 2
-                @test nchildren(tree,3) == nchildren(tree[3]) == tree[3].nchildren == 0
-                @test nchildren(tree,4) == nchildren(tree[4]) == tree[4].nchildren == 2
-                @test nchildren(tree,5) == nchildren(tree[5]) == tree[5].nchildren == 0
-                @test nchildren(tree,6) == nchildren(tree[6]) == tree[6].nchildren == 2
-                @test nchildren(tree,7) == nchildren(tree[7]) == tree[7].nchildren == 0
-                @test nchildren(tree,8) == nchildren(tree[8]) == tree[8].nchildren == 2
-                @test nchildren(tree,9) == nchildren(tree[9]) == tree[9].nchildren == 0
-                @test nchildren(tree,10) == nchildren(tree[10]) == tree[10].nchildren == 2
-                @test nchildren(tree,11) == nchildren(tree[11]) == tree[11].nchildren == 0
-                @test nchildren(tree,12) == nchildren(tree[12]) == tree[12].nchildren == 2
-                @test nchildren(tree,13) == nchildren(tree[13]) == tree[13].nchildren == 0
-                @test_throws BoundsError(tree,0) nchildren(tree,0)
-                @test_throws BoundsError(tree,14) nchildren(tree,14)
-                @test ParallelUtilities.topnoderank(tree) == 8
-            end
-
-            @testset "level" begin
-                tree = OrderedBinaryTree(1:15)
-                @test ParallelUtilities.levels(tree) == 4
-
-                @test ParallelUtilities.levelfromtop.((tree,),1:2:15) == ones(Int,8).*4
-                @test ParallelUtilities.levelfromtop.((tree,),(2,6,10,14)) == (3,3,3,3)
-                @test ParallelUtilities.levelfromtop.((tree,),(4,12)) == (2,2)
-                @test ParallelUtilities.levelfromtop(tree,8) == 1
-                for p in [0,length(tree)+1]
-                    @test_throws BoundsError(tree,p) ParallelUtilities.levelfromtop(tree,p)
-                end
-
-                tree = OrderedBinaryTree(1:13)
-                @test ParallelUtilities.levels(tree) == 4
-                @test ParallelUtilities.levelfromtop.((tree,),1:2:11) == ones(Int,6).*4
-                @test ParallelUtilities.levelfromtop.((tree,),(2,6,10,13)) == (3,3,3,3)
-                @test ParallelUtilities.levelfromtop.((tree,),(4,12)) == (2,2)
-                @test ParallelUtilities.levelfromtop(tree,8) == 1
-                for p in [0,length(tree)+1]
-                    @test_throws BoundsError(tree,p) ParallelUtilities.levelfromtop(tree,p)
-                end
-            end
-        end
-
-        @testsetwithinfo "SegmentedSequentialBinaryTree" begin
-            @testsetwithinfo "single host" begin
-                @testset "pid and parent" begin
-                    for imax = 1:100
-                        procs = 1:imax
-                        workersonhosts = Dict("host"=>procs)
-                        tree = SegmentedSequentialBinaryTree(procs,workersonhosts)
-                        SBT = SequentialBinaryTree(procs)
-                        @test length(tree) == length(procs) == length(SBT)
-
-                        topnoderank = ParallelUtilities.topnoderank(tree)
-                        @test topnoderank == 1
-                        @test tree[topnoderank] == ParallelUtilities.topnode(tree)
-                        @test tree[1].parent == 1
-                        for rank in 1:length(tree)
-                            node = tree[rank]
-                            parentnode = tree[parentnoderank(tree,rank)]
-                            @test length(procs) > 1 ? nchildren(parentnode) > 0 : nchildren(parentnode) == 0
-                            @test node.p == procs[rank]
-                            @test node.parent == procs[parentnoderank(SBT,rank)]
-                            @test parentnode.p == node.parent
-                        end
-                    end
-                end;
-
-                @testset "nchildren" begin
-                    procs = 1:1
-                    tree = SegmentedSequentialBinaryTree(procs,Dict("host"=>procs))
-                    @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 0
-                    @test_throws BoundsError(tree,0) nchildren(tree,0)
-                    @test_throws BoundsError(tree,2) nchildren(tree,2)
-
-                    procs = 1:2
-                    tree = SegmentedSequentialBinaryTree(procs,Dict("host"=>procs))
-                    @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 1
-                    @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 0
-                    @test_throws BoundsError(tree,0) nchildren(tree,0)
-                    @test_throws BoundsError(tree,3) nchildren(tree,3)
-
-                    procs = 1:8
-                    tree = SegmentedSequentialBinaryTree(procs,Dict("host"=>procs))
-                    @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 2
-                    @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 2
-                    @test nchildren(tree,3) == nchildren(tree[3]) == tree[3].nchildren == 2
-                    @test nchildren(tree,4) == nchildren(tree[4]) == tree[4].nchildren == 1
-                    @test nchildren(tree,5) == nchildren(tree[5]) == tree[5].nchildren == 0
-                    @test nchildren(tree,6) == nchildren(tree[6]) == tree[6].nchildren == 0
-                    @test nchildren(tree,7) == nchildren(tree[7]) == tree[7].nchildren == 0
-                    @test nchildren(tree,8) == nchildren(tree[8]) == tree[8].nchildren == 0
-                    @test_throws BoundsError(tree,0) nchildren(tree,0)
-                    @test_throws BoundsError(tree,9) nchildren(tree,9)
-                end;
-            end;
-
-            @testsetwithinfo "multiple hosts" begin
-                @testset "length" begin
-                    procs = 1:2
-                    tree = SegmentedSequentialBinaryTree(procs,
-                        OrderedDict("host1"=>1:1,"host2"=>2:2))
-                    @test length(tree) == 2 + 1
-
-                    procs = 1:4
-                    tree = SegmentedSequentialBinaryTree(procs,
-                        OrderedDict("host1"=>1:2,"host2"=>3:4))
-
-                    @test length(tree) == 4 + 1
-
-                    procs = 1:12
-                    tree = SegmentedSequentialBinaryTree(procs,
-                        OrderedDict(
-                            "host1"=>1:3,"host2"=>4:6,
-                            "host3"=>7:9,"host4"=>10:12))
-
-                    @test length(tree) == 12 + 3 
-                end;
-
-                @testset "leafrankfoldedtree" begin
-                    treeflag = SequentialBinaryTree(1:1)
-                    @test leafrankfoldedtree(treeflag,5,1) == 8
-                    @test leafrankfoldedtree(treeflag,5,2) == 9
-                    @test leafrankfoldedtree(treeflag,5,3) == 5
-                    @test leafrankfoldedtree(treeflag,5,4) == 6
-                    @test leafrankfoldedtree(treeflag,5,5) == 7
-                end;
-
-                @testset "pid and parent" begin
-                    for imax = 2:100
-                        procs = 1:imax
-                        mid = div(imax,2)
-                        workersonhosts = OrderedDict{String,Vector{Int}}()
-                        workersonhosts["host1"] = procs[1:mid]
-                        workersonhosts["host2"] = procs[mid+1:end]
-                        tree = SegmentedSequentialBinaryTree(procs,workersonhosts)
-
-                        topnoderank = ParallelUtilities.topnoderank(tree)
-                        @test topnoderank == 1
-                        @test tree[topnoderank] == ParallelUtilities.topnode(tree)
-                        @test tree[1].parent == 1
-                        @test parentnoderank(tree,1) == 1
-                        for (ind,rank) in enumerate(1:mid)
-                            node = tree[rank+1]
-                            parentnode = tree[parentnoderank(tree,rank+1)]
-                            @test nchildren(parentnode) > 0
-                            @test parentnode.p == node.parent
-                            pnodes = workersonhosts["host1"]
-                            @test node.p == pnodes[ind]
-                            SBT = SequentialBinaryTree(pnodes)
-                            if ind == 1
-                                @test node.parent == 1
-                            else
-                                @test node.parent == pnodes[parentnoderank(SBT,ind)]
-                            end
-                        end
-                        for (ind,rank) in enumerate(mid+1:imax)
-                            node = tree[rank+1]
-                            parentnode = tree[parentnoderank(tree,rank+1)]
-                            @test nchildren(parentnode) > 0
-                            @test parentnode.p == node.parent
-                            pnodes = workersonhosts["host2"]
-                            @test node.p == pnodes[ind]
-                            SBT = SequentialBinaryTree(pnodes)
-                            if ind == 1
-                                @test node.parent == 1
-                            else
-                                @test node.parent == pnodes[parentnoderank(SBT,ind)]
-                            end
-                        end
-                    end
-                end;
-
-                @testset "nchildren" begin
-                    procs = 1:2
-                    tree = SegmentedSequentialBinaryTree(procs,
-                        OrderedDict("host1"=>1:1,"host2"=>2:2))
-                    @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 2
-                    @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 0
-                    @test nchildren(tree,3) == nchildren(tree[3]) == tree[3].nchildren == 0
-                    @test_throws BoundsError(tree,0) nchildren(tree,0)
-                    @test_throws BoundsError(tree,4) nchildren(tree,4)
-
-                    procs = 1:12
-                    tree = SegmentedSequentialBinaryTree(procs,
-                        OrderedDict(
-                            "host1"=>1:3,"host2"=>4:6,
-                            "host3"=>7:9,"host4"=>10:12))
-                    @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 2
-                    @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 2
-                    @test nchildren(tree,3) == nchildren(tree[3]) == tree[3].nchildren == 2
-                    @test nchildren(tree,4) == nchildren(tree[4]) == tree[4].nchildren == 2
-                    @test nchildren(tree,5) == nchildren(tree[5]) == tree[5].nchildren == 0
-                    @test nchildren(tree,6) == nchildren(tree[6]) == tree[6].nchildren == 0
-                    @test nchildren(tree,7) == nchildren(tree[7]) == tree[7].nchildren == 2
-                    @test nchildren(tree,8) == nchildren(tree[8]) == tree[8].nchildren == 0
-                    @test nchildren(tree,9) == nchildren(tree[9]) == tree[9].nchildren == 0
-                    @test nchildren(tree,10) == nchildren(tree[10]) == tree[10].nchildren == 2
-                    @test nchildren(tree,11) == nchildren(tree[11]) == tree[11].nchildren == 0
-                    @test nchildren(tree,12) == nchildren(tree[12]) == tree[12].nchildren == 0
-                    @test nchildren(tree,13) == nchildren(tree[13]) == tree[13].nchildren == 2
-                    @test nchildren(tree,14) == nchildren(tree[14]) == tree[14].nchildren == 0
-                    @test nchildren(tree,15) == nchildren(tree[15]) == tree[15].nchildren == 0
-                    @test_throws BoundsError(tree,0) nchildren(tree,0)
-                    @test_throws BoundsError(tree,16) nchildren(tree,16)
-                end;
-            end;
-
-            @testset "fulltree-toptree indexing" begin
-                procs = 1:5
-                tree = SequentialBinaryTree(procs)
-                @test ParallelUtilities.toptree_to_fulltree_index(tree,3) == 3
-                @test ParallelUtilities.fulltree_to_toptree_index(tree,3) == 3
-            end
-        end
-
-        @testsetwithinfo "SegmentedOrderedBinaryTree" begin
-            @testsetwithinfo "single host" begin
-                @testset "pid and parent" begin
-                    for imax = 1:100
-                        procs = 1:imax
-                        workersonhosts = Dict("host"=>procs)
-                        tree = SegmentedOrderedBinaryTree(procs,workersonhosts)
-                        treeOBT = OrderedBinaryTree(procs)
-                        @test length(tree) == length(procs) == length(treeOBT)
-
-                        topnoderank = ParallelUtilities.topnoderank(tree)
-                        # The top node is its own parent
-                        @test tree[topnoderank].parent == topnoderank
-                        @test tree[topnoderank] == ParallelUtilities.topnode(tree)
-                        for rank in 1:length(tree)
-                            node = tree[rank]
-                            parentnode = tree[parentnoderank(tree,rank)]
-                            @test length(procs) > 1 ? nchildren(parentnode) > 0 : nchildren(parentnode) == 0
-                            @test node.p == procs[rank]
-                            @test node.parent == procs[parentnoderank(treeOBT,rank)]
-                            @test parentnode.p == node.parent
-                        end
-                    end
-                end;
-
-                @testset "nchildren" begin
-                    procs = 1:1
-                    tree = SegmentedOrderedBinaryTree(procs,Dict("host"=>procs))
-                    @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 0
-                    @test_throws BoundsError(tree,0) nchildren(tree,0)
-                    @test_throws BoundsError(tree,2) nchildren(tree,2)
-                    @test ParallelUtilities.topnoderank(tree) == 1
-
-                    procs = 1:2
-                    tree = SegmentedOrderedBinaryTree(procs,Dict("host"=>procs))
-                    @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 0
-                    @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 1
-                    @test_throws BoundsError(tree,0) nchildren(tree,0)
-                    @test_throws BoundsError(tree,3) nchildren(tree,3)
-                    @test ParallelUtilities.topnoderank(tree) == 2
-
-                    procs = 1:8
-                    tree = SegmentedOrderedBinaryTree(procs,Dict("host"=>procs))
-                    @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 0
-                    @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 2
-                    @test nchildren(tree,3) == nchildren(tree[3]) == tree[3].nchildren == 0
-                    @test nchildren(tree,4) == nchildren(tree[4]) == tree[4].nchildren == 2
-                    @test nchildren(tree,5) == nchildren(tree[5]) == tree[5].nchildren == 0
-                    @test nchildren(tree,6) == nchildren(tree[6]) == tree[6].nchildren == 2
-                    @test nchildren(tree,7) == nchildren(tree[7]) == tree[7].nchildren == 0
-                    @test nchildren(tree,8) == nchildren(tree[8]) == tree[8].nchildren == 1
-                    @test_throws BoundsError(tree,0) nchildren(tree,0)
-                    @test_throws BoundsError(tree,9) nchildren(tree,9)
-                    @test ParallelUtilities.topnoderank(tree) == 8
-
-                    procs = 1:11
-                    tree = SegmentedOrderedBinaryTree(procs,Dict("host"=>procs))
-                    @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 0
-                    @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 2
-                    @test nchildren(tree,3) == nchildren(tree[3]) == tree[3].nchildren == 0
-                    @test nchildren(tree,4) == nchildren(tree[4]) == tree[4].nchildren == 2
-                    @test nchildren(tree,5) == nchildren(tree[5]) == tree[5].nchildren == 0
-                    @test nchildren(tree,6) == nchildren(tree[6]) == tree[6].nchildren == 2
-                    @test nchildren(tree,7) == nchildren(tree[7]) == tree[7].nchildren == 0
-                    @test nchildren(tree,8) == nchildren(tree[8]) == tree[8].nchildren == 2
-                    @test nchildren(tree,9) == nchildren(tree[9]) == tree[9].nchildren == 0
-                    @test nchildren(tree,10) == nchildren(tree[10]) == tree[10].nchildren == 2
-                    @test nchildren(tree,11) == nchildren(tree[11]) == tree[11].nchildren == 0
-                    @test_throws BoundsError(tree,0) nchildren(tree,0)
-                    @test_throws BoundsError(tree,12) nchildren(tree,12)
-                    @test ParallelUtilities.topnoderank(tree) == 8
-
-                    procs = 1:13
-                    tree = SegmentedOrderedBinaryTree(procs,Dict("host"=>procs))
-                    @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 0
-                    @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 2
-                    @test nchildren(tree,3) == nchildren(tree[3]) == tree[3].nchildren == 0
-                    @test nchildren(tree,4) == nchildren(tree[4]) == tree[4].nchildren == 2
-                    @test nchildren(tree,5) == nchildren(tree[5]) == tree[5].nchildren == 0
-                    @test nchildren(tree,6) == nchildren(tree[6]) == tree[6].nchildren == 2
-                    @test nchildren(tree,7) == nchildren(tree[7]) == tree[7].nchildren == 0
-                    @test nchildren(tree,8) == nchildren(tree[8]) == tree[8].nchildren == 2
-                    @test nchildren(tree,9) == nchildren(tree[9]) == tree[9].nchildren == 0
-                    @test nchildren(tree,10) == nchildren(tree[10]) == tree[10].nchildren == 2
-                    @test nchildren(tree,11) == nchildren(tree[11]) == tree[11].nchildren == 0
-                    @test nchildren(tree,12) == nchildren(tree[12]) == tree[12].nchildren == 2
-                    @test nchildren(tree,13) == nchildren(tree[13]) == tree[13].nchildren == 0
-                    @test_throws BoundsError(tree,0) nchildren(tree,0)
-                    @test_throws BoundsError(tree,14) nchildren(tree,14)
-                    @test ParallelUtilities.topnoderank(tree) == 8
-                end;
-            end;
-
-            @testsetwithinfo "multiple hosts" begin
-                @testset "length" begin
-                    procs = 1:2
-                    tree = SegmentedOrderedBinaryTree(procs,
-                        OrderedDict("host1"=>1:1,"host2"=>2:2))
-                    @test length(tree) == 2 + 1
-
-                    procs = 1:4
-                    tree = SegmentedOrderedBinaryTree(procs,
-                        OrderedDict("host1"=>1:2,"host2"=>3:4))
-
-                    @test length(tree) == 4 + 1
-
-                    procs = 1:12
-                    tree = SegmentedOrderedBinaryTree(procs,
-                        OrderedDict(
-                            "host1"=>1:3,"host2"=>4:6,
-                            "host3"=>7:9,"host4"=>10:12))
-
-                    @test length(tree) == 12 + 3 
-                end;
-
-                @testset "leafrankfoldedtree" begin
-                    treeflag = OrderedBinaryTree(1:1)
-                    @test leafrankfoldedtree(treeflag,5,1) == 1
-                    @test leafrankfoldedtree(treeflag,5,2) == 3
-                    @test leafrankfoldedtree(treeflag,5,3) == 5
-                    @test leafrankfoldedtree(treeflag,5,4) == 7
-                    @test leafrankfoldedtree(treeflag,5,5) == 9
-                end;
-
-                @testset "pid and parent" begin
-                    for imax = 2:100
-                        procs = 1:imax
-                        mid = div(imax,2)
-                        workersonhosts = OrderedDict{String,Vector{Int}}()
-                        workersonhosts["host1"] = procs[1:mid]
-                        workersonhosts["host2"] = procs[mid+1:end]
-                        tree = SegmentedOrderedBinaryTree(procs,workersonhosts)
-
-                        top = ParallelUtilities.topnoderank(tree)
-                        @test tree[top] == ParallelUtilities.topnode(tree)
-                        for (ind,rank) in enumerate(1:mid)
-                            node = tree[rank+1]
-                            parentnode = tree[parentnoderank(tree,rank+1)]
-                            @test parentnode.p == node.parent
-                            pnodes = workersonhosts["host1"]
-                            @test node.p == pnodes[ind]
-                            OBT = OrderedBinaryTree(pnodes)
-                            if ind == ParallelUtilities.topnoderank(OBT)
-                                # Special check for 2 hosts as 
-                                # there's only one node in the top tree
-                                @test node.parent == ParallelUtilities.topnode(tree.toptree).p
-                            else
-                                @test node.parent == pnodes[parentnoderank(OBT,ind)]
-                            end
-                        end
-                        for (ind,rank) in enumerate(mid+1:imax)
-                            node = tree[rank+1]
-                            parentnode = tree[parentnoderank(tree,rank+1)]
-                            @test parentnode.p == node.parent
-                            pnodes = workersonhosts["host2"]
-                            @test node.p == pnodes[ind]
-                            OBT = OrderedBinaryTree(pnodes)
-                            if ind == ParallelUtilities.topnoderank(OBT)
-                                # Special check for 2 hosts as 
-                                # there's only one node in the top tree
-                                @test node.parent == ParallelUtilities.topnode(tree.toptree).p
-                            else
-                                @test node.parent == pnodes[parentnoderank(OBT,ind)]
-                            end
-                        end
-                    end
-                end;
-
-                @testset "nchildren" begin
-                    procs = 1:2
-                    tree = SegmentedOrderedBinaryTree(procs,
-                        OrderedDict("host1"=>1:1,"host2"=>2:2))
-                    @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 2
-                    @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 0
-                    @test nchildren(tree,3) == nchildren(tree[3]) == tree[3].nchildren == 0
-                    @test_throws BoundsError(tree,0) nchildren(tree,0)
-                    @test_throws BoundsError(tree,4) nchildren(tree,4)
-
-                    procs = 1:12
-                    tree = SegmentedOrderedBinaryTree(procs,
-                        OrderedDict(
-                            "host1"=>1:3,"host2"=>4:6,
-                            "host3"=>7:9,"host4"=>10:12))
-                    @test nchildren(tree,1) == nchildren(tree[1]) == tree[1].nchildren == 2
-                    @test nchildren(tree,2) == nchildren(tree[2]) == tree[2].nchildren == 2
-                    @test nchildren(tree,3) == nchildren(tree[3]) == tree[3].nchildren == 2
-                    @test nchildren(tree,4) == nchildren(tree[4]) == tree[4].nchildren == 0
-                    @test nchildren(tree,5) == nchildren(tree[5]) == tree[5].nchildren == 2
-                    @test nchildren(tree,6) == nchildren(tree[6]) == tree[6].nchildren == 0
-                    @test nchildren(tree,7) == nchildren(tree[7]) == tree[7].nchildren == 0
-                    @test nchildren(tree,8) == nchildren(tree[8]) == tree[8].nchildren == 2
-                    @test nchildren(tree,9) == nchildren(tree[9]) == tree[9].nchildren == 0
-                    @test nchildren(tree,10) == nchildren(tree[10]) == tree[10].nchildren == 0
-                    @test nchildren(tree,11) == nchildren(tree[11]) == tree[11].nchildren == 2
-                    @test nchildren(tree,12) == nchildren(tree[12]) == tree[12].nchildren == 0
-                    @test nchildren(tree,13) == nchildren(tree[13]) == tree[13].nchildren == 0
-                    @test nchildren(tree,14) == nchildren(tree[14]) == tree[14].nchildren == 2
-                    @test nchildren(tree,15) == nchildren(tree[15]) == tree[15].nchildren == 0
-                    @test_throws BoundsError(tree,0) nchildren(tree,0)
-                    @test_throws BoundsError(tree,16) nchildren(tree,16)
-                end;
-            end;
-        end
-
-        @testsetwithinfo "unsegmentedtree" begin
-            @test ParallelUtilities.unsegmentedtree(SegmentedSequentialBinaryTree) == SequentialBinaryTree
-            @test ParallelUtilities.unsegmentedtree(SegmentedOrderedBinaryTree) == OrderedBinaryTree
-        end
-    end
-    
-    @testsetwithinfo "RemoteChannelContainer" begin
-    	@testsetwithinfo "Constructor" begin
-    	    rc = ParallelUtilities.RemoteChannelContainer{Int}(1,myid())
-	        @test rc.out.where == myid()
-	        @test rc.err.where == myid()
-	        @test eltype(rc) == Int
-
-            c = Channel(nworkers())
-            tasks = Vector{Task}(undef,nworkers())
-            @sync begin
-                for (ind,p) in enumerate(workers())
-                    tasks[ind] = @async begin
-                        try
-                            rc = ParallelUtilities.RemoteChannelContainer{Int}(1,p)
-                            res = (rc.out.where,rc.err.where,eltype(rc))
-                            put!(c,(ind,p,res,false))
-                        catch
-                            put!(c,(ind,p,(),true))
-                            rethrow()
-                        end
-                    end
-                end
-                for i = 1:nworkers()
-                    ind,p,res,err = take!(c)
-                    err && wait(tasks[ind])
-                    @test res == (p,p,Int)
-                    showworkernumber(i,nworkers())
-                end
-            end
-
-	        rc = ParallelUtilities.RemoteChannelContainer{Int}(1)
-	        @test rc.out.where == myid()
-	        @test rc.err.where == myid()
-	        @test eltype(rc) == Int
-
-	        rc = ParallelUtilities.RemoteChannelContainer(1,myid())
-	        @test rc.out.where == myid()
-	        @test rc.err.where == myid()
-	        @test eltype(rc) == Any
-
-            c = Channel(nworkers())
-            tasks = Vector{Task}(undef,nworkers())
-            @sync begin
-                for (ind,p) in enumerate(workers())
-                    tasks[ind] = @async begin
-                        try
-                            rc = ParallelUtilities.RemoteChannelContainer(1,p)
-                            res = (rc.out.where,rc.err.where,eltype(rc))
-                            put!(c,(ind,p,res,false))
-                        catch
-                            put!(c,(ind,p,(),true))
-                            rethrow()
-                        end
-                    end
-                end
-                for i = 1:nworkers()
-                    ind,p,res,err = take!(c)
-                    err && wait(tasks[ind])
-                    @test res == (p,p,Any)
-                    showworkernumber(i,nworkers())
-                end
-            end
-
-	        rc = ParallelUtilities.RemoteChannelContainer(1)
-	        @test rc.out.where == myid()
-	        @test rc.err.where == myid()
-	        @test eltype(rc) == Any
-    	end
-
-        @testsetwithinfo "finalize" begin
-            rc = ParallelUtilities.RemoteChannelContainer{Int}(1)
-            finalize(rc)
-            @test rc.out.where == 0
-            @test rc.err.where == 0
-        end
-
-        @testsetwithinfo "finalize_except_wherewhence" begin
-            rc = ParallelUtilities.RemoteChannelContainer{Int}(1)
-            ParallelUtilities.finalize_except_wherewhence(rc)
-            @test rc.out.where == myid()
-            @test rc.err.where == myid()
-
-            @testset "rc on where" begin
-                # Create on this processor
-                rc = ParallelUtilities.RemoteChannelContainer{Int}(1)
-                c = Channel(nworkers())
-                tasks = Vector{Task}(undef,nworkers())
-                @sync begin
-                    for (ind,p) in enumerate(workers())
-                        tasks[ind] = @async begin
-                            try
-                                rcoutw,rcerrw = @fetchfrom p begin 
-                                    ParallelUtilities.finalize_except_wherewhence(rc)
-                                    rc.out.where,rc.err.where
-                                end
-                                res = (rc.out.where,rc.err.where,rcoutw,rcerrw)
-                                put!(c,(ind,res,false))
-                            catch
-                                put!(c,(ind,(),true))
-                                rethrow()
-                            end
-                        end
-                    end
-                    for i = 1:nworkers()
-                        ind,res,err = take!(c)
-                        err && wait(tasks[ind])
-                        @test res == (myid(),myid(),0,0)
-                        showworkernumber(i,nworkers())
-                    end
-                end
-            end
-
-            @testset "rc on remote" begin
-                # Create elsewhere
-                p_rc = workers()[1]
-                rc = ParallelUtilities.RemoteChannelContainer{Int}(1,p_rc)
-                c = Channel(nprocs())
-                tasks = Vector{Task}(undef,nprocs())
-                @sync begin
-                    for (ind,p) in enumerate(procs())
-                        tasks[ind] = @async begin
-                            try
-                                rcw = @fetchfrom p begin
-                                    ParallelUtilities.finalize_except_wherewhence(rc)
-                                    (rc.out.where,rc.err.where)
-                                end
-                                put!(c,(ind,p,rcw,false))
-                            catch
-                                put!(c,(ind,p,(),true))
-                                rethrow()
-                            end
-                        end
-                    end
-                    for i = 1:nworkers()
-                        ind,p,res,err = take!(c)
-                        err && wait(tasks[ind])
-                        if p != myid() && p != p_rc
-                            @test res == (0,0)
-                        else
-                            @test res == (p_rc,p_rc)
-                        end
-                        showworkernumber(i,nworkers())
-                    end
-                end
-            end
-        end
-    end
-
-	@testsetwithinfo "BranchChannel" begin
-	    @testset "Constructor" begin
-	    	@testset "all channels supplied" begin
-    	        rc_self = RemoteChannelContainer{Int}(1)
-    	        rc_parent = RemoteChannelContainer{Int}(1)
-    	        rc_children = RemoteChannelContainer{Int}(1)
-    	        for n=0:2
-    	        	b = BranchChannel(1,rc_self,rc_parent,rc_children,n)
-    	        	@test b isa BranchChannel{Int,Int}
-    	        	@test b.p == 1
-    	        	@test b.selfchannels == rc_self
-    	        	@test b.parentchannels == rc_parent
-    	        	@test b.childrenchannels == rc_children
-                    @test nchildren(b) == b.nchildren == n
-    	        end
-    	        @test_throws ParallelUtilities.DomainError BranchChannel(1,rc_self,rc_parent,rc_children,3)
-	    	end
-
-	    	@testset "only parent channels supplied" begin
-	    		rc_parent = RemoteChannelContainer{Int}(1)
-	    		for n=0:2
-    	        	b = BranchChannel(1,Int,rc_parent,n)
-    	        	@test b isa BranchChannel{Int,Int}
-    	        	@test b.p == 1
-    	        	@test b.parentchannels == rc_parent
-    	        	@test b.selfchannels isa RemoteChannelContainer{Int}
-    	        	@test b.childrenchannels isa RemoteChannelContainer{Int}
-    	        	@test b.selfchannels.out.where == b.p
-    	        	@test b.selfchannels.err.where == b.p
-    	        	@test b.childrenchannels.out.where == b.p
-    	        	@test b.childrenchannels.err.where == b.p
-                    @test nchildren(b) == b.nchildren == n
-    	        end
-	        	@test_throws ParallelUtilities.DomainError BranchChannel(1,Int,rc_parent,3)
-	    	end
-
-	    	@testset "no channels supplied" begin
-	    		function testbranchchannel(b::BranchChannel{T,T},p,n) where {T}
-    	        	@test b.p == p
-    	        	@test b.parentchannels isa RemoteChannelContainer{T}
-    	        	@test b.selfchannels isa RemoteChannelContainer{T}
-    	        	@test b.childrenchannels isa RemoteChannelContainer{T}
-    	        	@test b.parentchannels.out.where == b.p
-    	        	@test b.parentchannels.err.where == b.p
-    	        	@test b.selfchannels.out.where == b.p
-    	        	@test b.selfchannels.err.where == b.p
-    	        	@test b.childrenchannels.out.where == b.p
-    	        	@test b.childrenchannels.err.where == b.p
-                    @test nchildren(b) == b.nchildren == n
-	    		end
-
-	    		p = workers()[1]
-	    	    for n=0:2
-    	        	b = BranchChannel{Int,Int}(p,n)
-    	        	testbranchchannel(b,p,n)
-    	        end
-	        	@test_throws ParallelUtilities.DomainError BranchChannel{Int,Int}(1,3)
-                @test_throws ParallelUtilities.DomainError BranchChannel{Int,Int}(1,-1)
-	    	end
-	    end
-
-	    @testset "finalize" begin
-	    	@testset "sameprocessor" begin
-    	        parentchannels = RemoteChannelContainer{Int}(1)
-    	        b = BranchChannel(1,Int,parentchannels,1)
-    	        finalize(b)
-    	        @test b.selfchannels.out.where == 0
-    	        @test b.selfchannels.err.where == 0
-	        	@test b.childrenchannels.out.where == 0
-	        	@test b.childrenchannels.err.where == 0
-	        	@test b.parentchannels.out.where == myid()
-	        	@test b.parentchannels.err.where == myid()
-	    	end
-	    	@testset "elsewhere" begin
-	    		p = workers()[1]
-    	        selfchannels = RemoteChannelContainer{Int}(1,p)
-    	        childrenchannels = RemoteChannelContainer{Int}(1,p)
-	    		
-	    		@testset "parent == whence == where == myid()" begin
-	    	        parentchannels = RemoteChannelContainer{Int}(1)
-	    	        b = BranchChannel(1,selfchannels,parentchannels,childrenchannels,1)
-	    	        self_w,parent_w,child_w = @fetchfrom p begin
-	    	        	finalize(b)
-	    	        	(b.selfchannels.out.where,b.selfchannels.err.where),
-	    	        	(b.parentchannels.out.where,b.parentchannels.err.where),
-	    	        	(b.childrenchannels.out.where,b.childrenchannels.err.where)
-	    	    	end
-	    	    	@test self_w == (0,0)
-		        	@test child_w == (0,0)
-		        	@test parent_w == (0,0)
-	    		end
-
-	    		@testset "(parent == where) != (whence == myid())" begin
-		        	parentchannels = RemoteChannelContainer{Int}(1,p)
-	    	        b = BranchChannel(1,selfchannels,parentchannels,childrenchannels,1)
-	    	        self_w,parent_w,child_w = @fetchfrom p begin
-	    	        	finalize(b)
-	    	        	(b.selfchannels.out.where,b.selfchannels.err.where),
-	    	        	(b.parentchannels.out.where,b.parentchannels.err.where),
-	    	        	(b.childrenchannels.out.where,b.childrenchannels.err.where)
-	    	    	end
-	    	    	@test self_w == (0,0)
-		        	@test child_w == (0,0)
-		        	@test parent_w == (p,p)
-		        end
-	    	end
-	    end
-
-	    @testset "createbranchchannels" begin
-	        function testbranches(T,tree)
-    	        branches = createbranchchannels(T,T,tree)
-    	        @test length(branches) == length(tree)
-                tnr = ParallelUtilities.topnoderank(tree)
-    	        for (rank,branch) in enumerate(branches)
-    	        	p = branch.p
-    	        	parentrank = parentnoderank(tree,rank)
-                    parentbranch = branches[parentrank]
-                    # Test channel host
-    	        	@test branch.selfchannels.out.where == p
-    	        	@test branch.selfchannels.err.where == p
-    	        	@test branch.childrenchannels.out.where == p
-    	        	@test branch.childrenchannels.err.where == p
-    	        	@test branch.parentchannels.out.where == parentbranch.p
-    	        	@test branch.parentchannels.err.where == parentbranch.p
-                    # Test link with parent
-                    # Holds for nodes other than the top node
-                    if rank != tnr
-                        @test branch.parentchannels.out === parentbranch.childrenchannels.out
-                        @test branch.parentchannels.err === parentbranch.childrenchannels.err
-                    end
-    	        end
-    	    end
-
-            @testset "SequentialBinaryTree" begin
-                tree = SequentialBinaryTree(workers());
-                for T in [Int,Any,Bool,Vector{Float64},Array{ComplexF64,2}]
-                    testbranches(T,tree)
-                end
-            end
-            @testset "OrderedBinaryTree" begin
-                tree = OrderedBinaryTree(workers())
-                for T in [Int,Any,Bool,Vector{Float64},Array{ComplexF64,2}]
-                    testbranches(T,tree)
-                end
-            end
-            @testset "SegmentedSequentialBinaryTree" begin
-                tree = SegmentedSequentialBinaryTree(workers())
-                for T in [Int,Any,Bool,Vector{Float64},Array{ComplexF64,2}]
-                    testbranches(T,tree)
-                end
-            end
-            @testset "SegmentedOrderedBinaryTree" begin
-                tree = SegmentedOrderedBinaryTree(workers())
-                for T in [Int,Any,Bool,Vector{Float64},Array{ComplexF64,2}]
-                    testbranches(T,tree)
-                end
-            end
-
-	        iterators = (1:nworkers()+1,)
-	        tree,branches = createbranchchannels(iterators,SequentialBinaryTree)
-	        @test eltype(first(branches).parentchannels) == Any
-            tree,branches = createbranchchannels(iterators,SegmentedSequentialBinaryTree)
-            @test eltype(first(branches).parentchannels) == Any
-            tree,branches = createbranchchannels(iterators,OrderedBinaryTree)
-            @test eltype(first(branches).parentchannels) == Any
-            tree,branches = createbranchchannels(Int,Int,iterators,SequentialBinaryTree)
-            @test eltype(first(branches).parentchannels) == Int
-            tree,branches = createbranchchannels(Int,Int,iterators,SegmentedSequentialBinaryTree)
-            @test eltype(first(branches).parentchannels) == Int
-	        tree,branches = createbranchchannels(Int,Int,iterators,OrderedBinaryTree)
-	        @test eltype(first(branches).parentchannels) == Int
-
-            # Make sure that all branches are defined
-            for T in [SequentialBinaryTree,
-                OrderedBinaryTree,
-                SegmentedSequentialBinaryTree,
-                SegmentedOrderedBinaryTree]
-
-                for nmax = 1:nworkers()
-                    iterators = (1:nmax,)
-                    tree,branches = createbranchchannels(iterators,T)
-                    for i in eachindex(branches)
-                        @test isassigned(branches,i)
-                    end
-                end
-            end
-
-            @testset "multiple hosts" begin
-                w = workers()
-                mid = div(nworkers(),2)
-                w1 = workers()[1:mid]
-                w2 = workers()[mid+1:end]
-                workersonhosts = OrderedDict("host1"=>w1,"host2"=>w2)
-
-                @testset "SegmentedSequentialBinaryTree" begin
-                    tree = SegmentedSequentialBinaryTree(w,workersonhosts)
-                    for T in [Int,Any]
-                        testbranches(T,tree)
-                    end
-                end
-                @testset "SegmentedOrderedBinaryTree" begin
-                    tree = SegmentedOrderedBinaryTree(w,workersonhosts)
-                    for T in [Int,Any]
-                        testbranches(T,tree)
-                    end
-                end
-            end
-	    end
-	end
-end;
-
-@testset "map reduce" begin
-    @testset "Sorted and Unsorted" begin
-        @test Sorted() isa Ordering
-        @test Unsorted() isa Ordering
-    end;
-
-    @testset "pval" begin
-        p = pval(2,3)
-        @test value(p) == 3
-        @test value(3) == 3
-        @test value(p) == value(value(p))
-
-        @test convert(pval{Any},p) == pval{Any}(2,3)
-        @test convert(pval{Float64},p) == pval{Any}(2,3.0)
-    end;
-
-    @testset "mapTreeNode" begin
-
-        @testset "maybepvalput!" begin
-            pipe = BranchChannel{Int,Int}(myid(),0)
-            rank = 1
-            maybepvalput!(pipe,rank,0)
-            @test isready(pipe.selfchannels.out)
-            @test take!(pipe.selfchannels.out) == 0
-
-            pipe = BranchChannel{pval,pval}(myid(),0)
-            maybepvalput!(pipe,rank,0)
-            @test isready(pipe.selfchannels.out)
-            @test take!(pipe.selfchannels.out) == pval(rank,0)
-
-            pipe = BranchChannel{pval{Int},pval{Int}}(myid(),0)
-            maybepvalput!(pipe,rank,0)
-            @test isready(pipe.selfchannels.out)
-            @test take!(pipe.selfchannels.out) == pval(rank,0)
-
-            T = Vector{ComplexF64}
-            pipe = BranchChannel{pval{T},pval{T}}(myid(),1)
-
-            val = ones(1).*im
-            maybepvalput!(pipe,rank,val)
-            @test isready(pipe.selfchannels.out)
-            @test take!(pipe.selfchannels.out) == pval(rank,ComplexF64[im])
-
-            val = ones(1)
-            maybepvalput!(pipe,rank,val)
-            @test isready(pipe.selfchannels.out)
-            @test take!(pipe.selfchannels.out) == pval(rank,ComplexF64[1])
-
-            T = Vector{Float64}
-            pipe = BranchChannel{pval{T},pval{T}}(myid(),1)
-
-            val = ones(1)
-            maybepvalput!(pipe,rank,val)
-            @test isready(pipe.selfchannels.out)
-            @test take!(pipe.selfchannels.out) == pval(rank,Float64[1])
-
-            val = ones(Int,1)
-            maybepvalput!(pipe,rank,val)
-            @test isready(pipe.selfchannels.out)
-            @test take!(pipe.selfchannels.out) == pval(rank,Float64[1])
-
-            pipe = BranchChannel{pval,pval}(myid(),1)
-
-            val = ones(1)
-            maybepvalput!(pipe,rank,val)
-            @test isready(pipe.selfchannels.out)
-            @test take!(pipe.selfchannels.out) == pval(rank,Float64[1])
-
-            val = ones(Int,1)
-            maybepvalput!(pipe,rank,val)
-            @test isready(pipe.selfchannels.out)
-            @test take!(pipe.selfchannels.out) == pval(rank,Int[1])
-        end
-
-        function test_on_pipe(fn,iterator,pipe,result_expected,progressrc=nothing)
-            rank = 1
-            @test_throws ErrorException mapTreeNode(x->error("fmap"),iterator,rank,pipe,progressrc)
-            @test !isready(pipe.selfchannels.out) # should not have any result as there was an error
-            @test isready(pipe.selfchannels.err)
-            @test take!(pipe.selfchannels.err) # error flag should be true
-            @test !isready(pipe.selfchannels.err) # should not hold anything now
-            @test !isready(pipe.parentchannels.out)
-            @test !isready(pipe.parentchannels.err)
-            @test !isready(pipe.childrenchannels.out)
-            @test !isready(pipe.childrenchannels.err)
-            if progressrc isa RemoteChannel
-                @test isready(progressrc)
-                @test take!(progressrc) == (false,false,rank)
-            end
-
-            mapTreeNode(fn,iterator,rank,pipe,progressrc)
-            @test isready(pipe.selfchannels.err)
-            @test !take!(pipe.selfchannels.err) # error flag should be false
-            @test !isready(pipe.selfchannels.err)
-            @test isready(pipe.selfchannels.out)
-            @test take!(pipe.selfchannels.out) == result_expected
-            @test !isready(pipe.selfchannels.out)
-            @test !isready(pipe.parentchannels.out)
-            @test !isready(pipe.parentchannels.err)
-            @test !isready(pipe.childrenchannels.out)
-            @test !isready(pipe.childrenchannels.err)
-            if progressrc isa RemoteChannel
-                @test isready(progressrc)
-                @test take!(progressrc) == (true,false,rank)
-            end
-        end
-
-        @testset "range" begin
-            iterator = 1:10
-            
-            pipe = BranchChannel{Int,Int}(myid(),0)
-            test_on_pipe(sum,iterator,pipe,sum(iterator))
-
-            pipe = BranchChannel{Int,Int}(myid(),0)
-            progress = RemoteChannel(()->Channel{Tuple{Bool,Bool,Int}}(1))
-            test_on_pipe(sum,iterator,pipe,sum(iterator),progress)
-        end
-        
-        @testset "ProductSplit" begin
-            iterators = (1:10,)
-            ps = ProductSplit(iterators,1,1)
-
-            pipe = BranchChannel{Int,Int}(myid(),0)
-            test_on_pipe(x->sum(y[1] for y in x),ps,pipe,sum(iterators[1]))
-
-            pipe = BranchChannel{Int,Int}(myid(),1)
-            test_on_pipe(x->sum(y[1] for y in x),ps,pipe,sum(iterators[1]))
-
-            pipe = BranchChannel{Int,Int}(myid(),2)
-            test_on_pipe(x->sum(y[1] for y in x),ps,pipe,sum(iterators[1]))
-        end
-
-        @testset "progress" begin
-            @test isnothing(ParallelUtilities.indicatemapprogress!(nothing,1))
-            rettype = Tuple{Bool,Bool,Int}
-            progress = RemoteChannel(()->Channel{rettype}(1))
-            ParallelUtilities.indicatemapprogress!(progress,10)
-            @test take!(progress) == (true,false,10)
-        end
-    end;
-
-    @testset "reduce" begin
-
-        # Leaves just push results to the parent
-        # reduced value at a leaf is simply whatever is stored in the local output channel
-        @testset "at a leaf" begin
-            # These do not check for errors
-            result = 1
-            rank = 1
-            val = pval(rank,result)
-
-            pipe = BranchChannel{typeof(val),typeof(val)}(myid(),0)
-            put!(pipe.selfchannels.out,val)
-            @test ParallelUtilities.reducedvalue(sum,rank,pipe,Sorted()) == val
-
-            pipe = BranchChannel{typeof(result),typeof(result)}(myid(),0)
-            put!(pipe.selfchannels.out,result)
-            @test ParallelUtilities.reducedvalue(sum,rank,pipe,Unsorted()) == result
-        end;
-
-        # # Values are collected at the intermediate nodes
-        @testset "at parent nodes" begin
-
-            # Put some known values on the self and children channels
-            @everywhere begin
-
-            function putselfchildren!(pipe::BranchChannel,ord::Ordering,rank=1,
-                args...)
-                putselfchildren!(pipe,ord,
-                    rank > 0 ? SubTreeNode(rank) : TopTreeNode(rank),
-                    args...)
-            end
-            function putselfchildren!(pipe::BranchChannel,::Unsorted,::TopTreeNode)
-                for i=1:nchildren(pipe)
-                    put!(pipe.childrenchannels.out,i)
-                    put!(pipe.childrenchannels.err,false)
-                end
-            end
-            function putselfchildren!(pipe::BranchChannel,::Unsorted,::SubTreeNode)
-                put!(pipe.selfchannels.out,0)
-                put!(pipe.selfchannels.err,false)
-                for i=1:nchildren(pipe)
-                    put!(pipe.childrenchannels.out,i)
-                    put!(pipe.childrenchannels.err,false)
-                end
-            end
-            function putselfchildren!(pipe::BranchChannel{<:pval},::Sorted,
-                node::SubTreeNode,leftchildrank=1,rightchildrank=3)
-
-                selfrank = node.rank
-                put!(pipe.selfchannels.out,pval(selfrank,2))
-                put!(pipe.selfchannels.err,false)
-                N = nchildren(pipe)
-                
-                if N > 0
-                    # left child
-                    put!(pipe.childrenchannels.out,pval(leftchildrank,1))
-                    put!(pipe.childrenchannels.err,false)
-                end
-
-                if N > 1
-                    # right child
-                    put!(pipe.childrenchannels.out,pval(rightchildrank,3))
-                    put!(pipe.childrenchannels.err,false)
-                end
-            end
-            function putselfchildren!(pipe::BranchChannel{<:pval},::Sorted,
-                node::TopTreeNode,leftchildrank=1,rightchildrank=3)
-
-                selfrank = node.rank
-                N = nchildren(pipe)
-                
-                if N > 0
-                    # left child
-                    put!(pipe.childrenchannels.out,pval(leftchildrank,1))
-                    put!(pipe.childrenchannels.err,false)
-                end
-
-                if N > 1
-                    # right child
-                    put!(pipe.childrenchannels.out,pval(rightchildrank,3))
-                    put!(pipe.childrenchannels.err,false)
-                end
-            end
-
-            function clearerrors!(pipe::BranchChannel,rank=1)
-                clearerrors!(pipe,
-                    rank > 0 ? SubTreeNode(rank) : TopTreeNode(rank))
-            end
-            function clearerrors!(pipe::BranchChannel,node::SubTreeNode)
-                take!(pipe.selfchannels.err)
-                for i=1:nchildren(pipe)
-                    take!(pipe.childrenchannels.err)
-                end
-            end
-            function clearerrors!(pipe::BranchChannel,node::TopTreeNode)
-                for i=1:nchildren(pipe)
-                    take!(pipe.childrenchannels.err)
-                end
-            end
-
-            end # everwhere
-
-            @testset "reducedvalue" begin
-
-                function testreduction(freduce::Function,pipe::BranchChannel,
-                    ifsorted::Unsorted,res_exp,rank=2)
-
-                    p = pipe.p
-
-                    try
-                        putselfchildren!(pipe,ifsorted,rank)
-                        @test value(reducedvalue(freduce,rank,pipe,ifsorted)) == res_exp
-                        clearerrors!(pipe,rank)
-                        
-                        @fetchfrom p putselfchildren!(pipe,ifsorted,rank)
-                        @test value(@fetchfrom p reducedvalue(freduce,rank,pipe,ifsorted)) == res_exp
-                        clearerrors!(pipe,rank)
-                        
-                        @fetchfrom p putselfchildren!(pipe,ifsorted,rank)
-                        @test value(reducedvalue(freduce,rank,pipe,ifsorted)) == res_exp
-                        clearerrors!(pipe,rank)
-                        
-                        putselfchildren!(pipe,ifsorted,rank)
-                        @test value(@fetchfrom p reducedvalue(freduce,rank,pipe,ifsorted)) == res_exp
-                        clearerrors!(pipe,rank)
-                    catch
-                        rethrow()
-                    end
-                end
-
-                function testreduction(freduce::Function,pipe::BranchChannel,
-                    ifsorted::Sorted,res_exp,
-                    selfrank=2,leftchildrank=1,rightchildrank=3)
-
-                    p = pipe.p
-                    ranks = (selfrank,leftchildrank,rightchildrank)
-
-                    try
-                        putselfchildren!(pipe,ifsorted,ranks...)
-                        @test value(reducedvalue(freduce,selfrank,pipe,ifsorted)) == res_exp
-                        clearerrors!(pipe,selfrank)
-                        
-                        @fetchfrom p putselfchildren!(pipe,ifsorted,ranks...)
-                        @test value(@fetchfrom p reducedvalue(freduce,selfrank,pipe,ifsorted)) == res_exp
-                        clearerrors!(pipe,selfrank)
-                        
-                        @fetchfrom p putselfchildren!(pipe,ifsorted,ranks...)
-                        @test value(reducedvalue(freduce,selfrank,pipe,ifsorted)) == res_exp
-                        clearerrors!(pipe,selfrank)
-                        
-                        putselfchildren!(pipe,ifsorted,ranks...)
-                        @test value(@fetchfrom p reducedvalue(freduce,selfrank,pipe,ifsorted)) == res_exp
-                        clearerrors!(pipe,selfrank)
-                    catch
-                        rethrow()
-                    end
-                end
-
-                for n = 1:2
-                    @testset "Unsorted" begin
-                        pipe = BranchChannel{Int,Int}(myid(),n)
-                        res_exp = sum(0:n)
-                        testreduction(sum,pipe,Unsorted(),res_exp,2)
-                        testreduction(sum,pipe,Unsorted(),res_exp,SubTreeNode(2))
-
-                        @testset "toptree" begin
-                            testreduction(sum,pipe,Unsorted(),res_exp,0)
-                            testreduction(sum,pipe,Unsorted(),res_exp,TopTreeNode(0))
-                        end
-                    end
-                    @testset "Sorted" begin
-                        pipe = BranchChannel{pval,pval}(myid(),n)
-                        res_exp = collect(1:n+1)
-                        testreduction(x->vcat(x...),pipe,Sorted(),res_exp)
-                        testreduction(x->vcat(x...),pipe,Sorted(),res_exp,SubTreeNode(2))
-                
-                        pipe = BranchChannel{pval,pval}(myid(),n)
-                        res_exp = sum(1:n+1)
-                        testreduction(sum,pipe,Sorted(),res_exp)
-                        testreduction(sum,pipe,Sorted(),res_exp,SubTreeNode(2))
-
-                        @testset "toptree" begin
-                            pipe = BranchChannel{pval,pval}(myid(),n)
-                            res_exp = n == 1 ? [1] : [1,3]
-                            testreduction(x->vcat(x...),pipe,Sorted(),res_exp,0,1,2)
-                            testreduction(x->vcat(x...),pipe,Sorted(),res_exp,TopTreeNode(0),1,2)
-                    
-                            pipe = BranchChannel{pval,pval}(myid(),n)
-                            res_exp = n == 1 ? 1 : 1+3
-                            testreduction(sum,pipe,Sorted(),res_exp,0,1,2)
-                            testreduction(sum,pipe,Sorted(),res_exp,TopTreeNode(0),1,2)
-                        end
-                    end
-                end
-            end
-
-            @testset "reduceTreeNode" begin
-
-                @everywhere begin
-                
-                function testfinalized(rank,pipe)
-                    rank > 0 ? 
-                        testfinalized(SubTreeNode(rank),pipe) : 
-                        testfinalized(TopTreeNode(rank),pipe)
-                end
-
-                function testfinalized(::SubTreeNode,pipe)
-                    @test pipe.selfchannels.out.where == 0
-                    @test pipe.selfchannels.err.where == 0
-                    @test pipe.childrenchannels.out.where == 0
-                    @test pipe.childrenchannels.err.where == 0
-                end
-                function testfinalized(::TopTreeNode,pipe)
-                    @test pipe.childrenchannels.out.where == 0
-                    @test pipe.childrenchannels.err.where == 0
-                end
-
-                strippedrank(t::ParallelUtilities.ReductionNode) = t.rank
-                strippedrank(t::Integer) = t
-
-                function testreduction(freduce::Function,pipe::BranchChannel,
-                    ifsorted::Ordering,res_exp,rank,
-                    progressrc=nothing,args...)
-
-                    @test !isready(pipe.parentchannels.out)
-                    @test !isready(pipe.parentchannels.err)
-
-                    wait(@spawnat(pipe.p,
-                        putselfchildren!(pipe,ifsorted,rank,args...) ) )
-                    reduceTreeNode(freduce,rank,pipe,ifsorted,progressrc)
-
-                    @test isready(pipe.parentchannels.out)
-                    @test isready(pipe.parentchannels.err)
-                    @test !take!(pipe.parentchannels.err) # there should be no error
-                    @test value(take!(pipe.parentchannels.out)) == res_exp
-
-                    if progressrc isa RemoteChannel
-                        @test isready(progressrc)
-                        @test take!(progressrc) == (false,true,strippedrank(rank))
-                    end
-
-                    # The pipe should be finalized at this point
-                    testfinalized(rank,pipe)
-                end
-
-                end # everywhere
-
-                for n = 1:2
-                    @testset "Unsorted" begin
-                        pipe = BranchChannel{Int,Int}(myid(),n)
-                        res_exp = sum(0:n)
-                        testreduction(sum,pipe,Unsorted(),res_exp,2)
-                        
-                        pipe = BranchChannel{Int,Int}(myid(),n)
-                        testreduction(sum,pipe,Unsorted(),res_exp,SubTreeNode(2))
-
-                        pipe = BranchChannel{Int,Int}(myid(),n)
-                        testreduction(sum,pipe,Unsorted(),res_exp,TopTreeNode(0))
-
-                        pipe = BranchChannel{Int,Int}(myid(),n)
-                        progress = RemoteChannel(()->Channel{Tuple{Bool,Bool,Int}}(1))
-                        @test_throws ErrorException testreduction(
-                            x->error("fred"),pipe,Unsorted(),
-                            res_exp,TopTreeNode(0),progress)
-                        @test isready(progress)
-                        @test take!(progress) == (false,false,0)
-
-                        pipe = BranchChannel{Int,Int}(myid(),n)
-                        progress = RemoteChannel(()->Channel{Tuple{Bool,Bool,Int}}(1))
-                        testreduction(sum,pipe,Unsorted(),
-                            res_exp,TopTreeNode(0),progress)
-
-                        rc_parent = RemoteChannelContainer{Int}(1)
-                        p = workers()[1]
-                        
-                        pipe = BranchChannel(p,Int,rc_parent,n)
-                        testreduction(sum,pipe,Unsorted(),res_exp,2)
-                        
-                        pipe = BranchChannel(p,Int,rc_parent,n)
-                        testreduction(sum,pipe,Unsorted(),res_exp,SubTreeNode(2))
-                    end
-                    @testset "Sorted" begin
-                        @testset "SubTreeNode" begin
-                            res_exp = collect(1:n+1)
-                            pipe = BranchChannel{pval,pval}(myid(),n)
-                            testreduction(x->vcat(x...),pipe,Sorted(),res_exp,2)
-                            
-                            pipe = BranchChannel{pval,pval}(myid(),n)
-                            testreduction(x->vcat(x...),pipe,Sorted(),res_exp,SubTreeNode(2))
-                            
-                            rc_parent = RemoteChannelContainer{pval}(myid(),1)
-                            p = workers()[1]
-                            
-                            pipe = BranchChannel(p,pval,rc_parent,n)
-                            testreduction(x->vcat(x...),pipe,Sorted(),res_exp,2)
-
-                            pipe = BranchChannel(p,pval,rc_parent,n)
-                            testreduction(x->vcat(x...),pipe,Sorted(),res_exp,SubTreeNode(2))
-                    
-                            res_exp = sum(1:n+1)
-                            pipe = BranchChannel{pval,pval}(myid(),n)
-                            testreduction(sum,pipe,Sorted(),res_exp,2)
-                            
-                            pipe = BranchChannel{pval,pval}(myid(),n)
-                            testreduction(sum,pipe,Sorted(),res_exp,SubTreeNode(2))
-
-                            rc_parent = RemoteChannelContainer{pval}(1)
-                            p = workers()[1]
-                            
-                            pipe = BranchChannel(p,pval,rc_parent,n)
-                            testreduction(sum,pipe,Sorted(),res_exp,2)
-                            
-                            pipe = BranchChannel(p,pval,rc_parent,n)
-                            testreduction(sum,pipe,Sorted(),res_exp,SubTreeNode(2))
-                        end
-                        
-                        @testset "TopTreeNode" begin
-                            res_exp = n == 1 ? [1] : [1,3]
-                            
-                            pipe = BranchChannel{pval,pval}(myid(),n)
-                            testreduction(x->vcat(x...),pipe,Sorted(),res_exp,0)
-
-                            pipe = BranchChannel{pval,pval}(myid(),n)
-                            testreduction(x->vcat(x...),pipe,Sorted(),res_exp,TopTreeNode(0))
-                        end
-                    end
-                end
-
-                # The top tree must have children by definition
-                pipe = BranchChannel{Int,Int}(myid(),0)
-                putselfchildren!(pipe,Unsorted(),0)
-                @test_throws ErrorException reducedvalue(sum,0,pipe,Unsorted())
-                clearerrors!(pipe,0)
-            end
-        end;
-
-        @testset "progress" begin
-            @test isnothing(ParallelUtilities.indicatereduceprogress!(nothing,1))
-            rettype = Tuple{Bool,Bool,Int}
-            progress = RemoteChannel(()->Channel{rettype}(1))
-            ParallelUtilities.indicatereduceprogress!(progress,10)
-            @test take!(progress) == (false,true,10)
-
-            @test isnothing(ParallelUtilities.indicatefailure!(nothing,1))
-            ParallelUtilities.indicatefailure!(progress,10)
-            @test take!(progress) == (false,false,10)
-        end
-    end;
-end
-
-@testset "reduction functions" begin
-    @testset "sumcat_aligned" begin
-        @testset "one array" begin
-            a1 = ones(3:4)
-            a12 = ParallelUtilities.sumcat_aligned(a1, dims = 1)
-            @test a12 == a1
-            @test_throws Exception ParallelUtilities.sumcat_aligned(a1, dims = 2)
-
-            a1 = ones(3:4, 4:5)
-            a12 = ParallelUtilities.sumcat_aligned(a1, dims = 1)
-            @test a12 == a1
-            a12 = ParallelUtilities.sumcat_aligned(a1, dims = 2)
-            @test a12 == a1
-            @test_throws Exception ParallelUtilities.sumcat_aligned(a1, dims = 3)
-
-            a1 = zeros(2:4, 2:4)
-            a1[2:3, 2:3] .+= 1
-            a1[3:4, 3:4] .+= 1
-
-            a12 = ParallelUtilities.sumcat_aligned(ones(2:3,2:3), ones(3:4,3:4), dims=(1,2))
-            @test a12 == a1
-
-            a1 = zeros(2:5, 2:5)
-            a1[2:3, 2:3] .+= 1
-            a1[4:5, 4:5] .+= 1
-
-            a12 = ParallelUtilities.sumcat_aligned(ones(2:3,2:3), ones(4:5,4:5), dims=(1,2))
-            @test a12 == a1
-        end
-
-        @testset "two arrays" begin
-
-            function test(a1ax, a2ax, a12ax, dims)
-                a1 = ones(a1ax)
-                a2 = ones(a2ax)
-                a12 = ParallelUtilities.sumcat_aligned(a1, a2, dims = dims)
-                a12exp = zeros(a12ax)
-                a12exp[a1ax...] .+= 1
-                a12exp[a2ax...] .+= 1
-                @test a12 == a12exp
-            end
-
-            test((3:4,), (6:8,), (3:8,), 1)
-
-            test((3:4, 3:4), (6:8, 3:4), (3:8, 3:4), 1)
-
-            test((3:4, 3:4), (3:4, 6:8), (3:4, 3:8), 2)
-
-            test((3:4, 3:4), (4:8, 4:8), (3:8, 3:8), (1,2))
-
-            test((3:4, 3:4), (6:8, 6:8), (3:8, 3:8), (1,2))
-
-            @test_throws DimensionMismatch test((3:4, 3:4), (6:8, 3:5), (3:8, 3:5), 1)
-            @test_throws ArgumentError test((3:4, 3:4), (3:4, 3:4), (3:4, 3:4), 3)
-        end
-    end
-    @testset "sumvcat_aligned" begin
-        @testset "one array" begin
-            a1 = ones(3:4)
-            a12 = ParallelUtilities.sumvcat_aligned(a1)
-            @test a12 == a1
-
-            a1 = ones(3:4, 4:5)
-            a12 = ParallelUtilities.sumvcat_aligned(a1)
-            @test a12 == a1
-
-            @test_throws ArgumentError ParallelUtilities.sumvcat_aligned(ones())
-        end
-
-        @testset "two arrays" begin
-
-            function test(a1ax, a2ax, a12ax)
-                a1 = ones(a1ax)
-                a2 = ones(a2ax)
-                a12 = ParallelUtilities.sumvcat_aligned(a1, a2)
-                a12exp = zeros(a12ax)
-                a12exp[a1ax...] .+= 1
-                a12exp[a2ax...] .+= 1
-                @test a12 == a12exp
-            end
-
-            test((3:4,), (6:8,), (3:8,))
-
-            test((3:4, 3:4), (6:8, 3:4), (3:8, 3:4))
-
-            test((3:4, 3:4), (4:8, 3:4), (3:8, 3:4))
-
-            @test_throws ArgumentError test((), (), ())
-
-            @test_throws DimensionMismatch test((3:4, 3:4), (6:8, 3:5), (3:8, 3:5))
-        end
-    end
-    @testset "sumhcat_aligned" begin
-        @testset "one array" begin
-            a1 = ones(3:4)
-            @test_throws Exception ParallelUtilities.sumhcat_aligned(a1)
-
-            a1 = ones(3:4, 4:5)
-            a12 = ParallelUtilities.sumhcat_aligned(a1)
-            @test a12 == a1
-
-            @test_throws ArgumentError ParallelUtilities.sumhcat_aligned(ones(1:2))
-        end
-
-        @testset "two arrays" begin
-
-            function test(a1ax, a2ax, a12ax)
-                a1 = ones(a1ax)
-                a2 = ones(a2ax)
-                a12 = ParallelUtilities.sumhcat_aligned(a1, a2)
-                a12exp = zeros(a12ax)
-                a12exp[a1ax...] .+= 1
-                a12exp[a2ax...] .+= 1
-                @test a12 == a12exp
-            end
-
-            test((3:4, 3:4), (3:4, 6:8), (3:4, 3:8))
-
-            test((3:4, 3:4), (3:4, 4:8), (3:4, 3:8))
-
-            @test_throws DimensionMismatch test((3:4, 3:4), (3:5, 6:8), (3:5, 3:8))
-            @test_throws ArgumentError test((3:4,), (3:4,), (3:4))
-        end
-    end
-end
-
-@testset "pmapbatch and pmapreduce" begin
-
-	@testsetwithinfo "pmapbatch" begin
-		@testsetwithinfo "batch" begin
-			@testset "comparison with map" begin
-				iterable = 1:nworkers()
-			    res = pmapbatch(x->myid(),iterable)
-			    @test res == workers()
-			    res = pmapbatch(x->myid(),(iterable,))
-			    @test res == workers()
-			    res = pmapbatch(x->myid(),(iterable,1:1))
-			    @test res == workers()
-			    res = pmapbatch(x->myid(),iterable,num_workers=1)
-			    @test res == workers()[1:1]
-
-			    iterable = 1:nworkers()-1
-			    res = pmapbatch(x->myid(),iterable)
-			    @test res == workersactive(iterable)
-
-			    iterable = 1:nworkers()
-			    res = pmapbatch(identity,iterable)
-			    resexp = [ProductSplit((iterable,),nworkersactive(iterable),p) for p=1:nworkersactive(iterable)]
-				@test res == resexp
-			    
-			    iterable = 1:nworkers()
-			    res = pmapbatch(identity,iterable)
-			    resexp = [ProductSplit((iterable,),nworkers(),p) for p=1:nworkers()]
-				@test res == resexp
-			    
-			    iterable = 1:2nworkers()
-			    res = pmapbatch(identity,iterable)
-			    resexp = [ProductSplit((iterable,),nworkersactive(iterable),p) for p=1:nworkersactive(iterable)]
-				@test res == resexp			    
-			end
-
-            @testset "multiple iterators" begin
-                res = pmapbatch(tup->[((x,y), x+y) for (x,y) in tup], (1:2,1:2))
-                @test res == [((1, 1), 2), ((2, 1), 3), ((1, 2), 3), ((2, 2), 4)]
-
-                xrange, yrange, zrange = 1:2, 2:3, 3:4
-                p = pmapbatch(ps->[sum(tup) for tup in ps], (xrange,yrange,zrange));
-                @test p == [6,7,7,8,7,8,8,9]
-            end
-
-            @testset "return type specified" begin
-                iterable = 1:nworkers()
-                @test pmapbatch(ParallelUtilities.workerrank, Int, iterable) == iterable
-                @test pmapbatch(ParallelUtilities.workerrank, Float64, iterable) == Float64.(iterable)
-                @test_throws Exception pmapbatch(ParallelUtilities.workerrank, Vector{Int}, iterable)
-            end
-
-			@testset "errors" begin
-			    @test_throws Exception pmapbatch(x->throw(BoundsError()),1:10)
-			end
-		end
-		
-		@testsetwithinfo "elementwise" begin
-			@testset "comparison with map" begin
-			    iterable = 1:nworkers()
-			    res = pmapbatch_elementwise(identity,iterable)
-			    @test res == iterable
-
-                res = pmapbatch_elementwise(identity,iterable,num_workers=1)
-                @test res == iterable
-
-				iterable = 1:20
-			    res = pmapbatch_elementwise(x->x^2,iterable)
-			    @test res == iterable.^2
-			end
-
-            @testset "multiple iterators" begin
-                xrange, yrange, zrange = 1:2, 2:3, 3:4
-                p = pmapbatch_elementwise((x,y,z)->x+y+z, (xrange,yrange,zrange));
-                @test p == [6,7,7,8,7,8,8,9]
-            end
-
-		    @testset "errors" begin
-			    @test_throws Exception pmapbatch_elementwise(x->throw(BoundsError()),1:10)
-			end
-		end
-	end;
-
-	@testsetwithinfo "pmapsum" begin
-		@testsetwithinfo "batch" begin
-		    @testset "rank" begin
-                res_exp = sum(1:nworkers())
-                @testset "without progress" begin
-                    res = pmapsum(x->x[1][1],Int,1:nworkers())
-    			    @test res == res_exp
-                    res = pmapsum(x->x[1][1],1:nworkers())
-                    @test res == res_exp
-                end
-                @testset "with progress" begin
-                    res = @test_deprecated pmapsum(x->x[1][1],Int,1:nworkers(), showprogress=true)
-                    @test res == res_exp
-                    res = @test_deprecated pmapsum(x->x[1][1],1:nworkers(), showprogress=true)
-                    @test res == res_exp
-                end
-			    @test pmapsum(x->x[1][1], Int, (1:nworkers(),)) == res_exp
-                @test pmapsum(x->x[1][1], (1:nworkers(),)) == res_exp
-                @test pmapsum(x->x[1][1], Int, (1:nworkers(), 1:1)) == res_exp
-			    @test pmapsum(x->x[1][1], (1:nworkers(), 1:1)) == res_exp
-			    @test pmapsum(x->myid(), 1:nworkers()) == sum(workers())
-		    end
-		    
-		    @testset "one iterator" begin
-			    rng = 1:100
-			    @test pmapsum(x->sum(y[1] for y in x),rng) == sum(rng)
-			    @test pmapsum(x->sum(y[1] for y in x),(rng,)) == sum(rng)
-		    end
-
-		    @testset "array" begin
-		    	@test pmapsum(x->ones(2),1:nworkers()) == ones(2).*nworkers()
-		    end
-
-		    @testset "stepped iterator" begin
-			    rng = 1:5:100
-			    @test pmapsum(x->sum(y[1] for y in x),rng) == sum(rng)
-		    end
-
-		    @testset "two iterators" begin
-			    iters = (1:100,1:2)
-			    @test pmapsum(x->sum(y[1] for y in x),iters) == sum(iters[1])*length(iters[2])
-		    end
-		    
-		    @testsetwithinfo "run elsewhere" begin
-		    	res_exp = sum(workers())
-                c = Channel{Tuple{Int,Int,Bool}}(nworkers())
-                tasks = Vector{Task}(undef,nworkers())
-                @sync begin 
-                    for (ind,p) in enumerate(workers())
-                        tasks[ind] = @async begin
-                            try
-                                res = @fetchfrom p pmapsum(x->myid(),1:nworkers())
-                                put!(c,(ind,res,false))
-                            catch
-                                put!(c,(ind,0,true))
-                                rethrow()
-                            end
-                        end
-                    end
-                    for i = 1:nworkers()
-                        ind,res,err = take!(c)
-                        err && wait(tasks[ind])
-                        @test res == res_exp
-                        showworkernumber(i,nworkers())
-                    end
-                end
-		    end;
-
-		    @testset "errors" begin
-		        @test_throws Exception pmapsum(x->error("map"),1:10)
-                @test_throws Exception pmapsum(x->fmap(x),1:10)
-		    end
-		end
-
-		@testsetwithinfo "elementwise" begin
-			@testset "comparison with map" begin
-			    iterable = 1:100
-                @testset "without progress" begin
-                    res = pmapsum_elementwise(identity,iterable)
-                    @test res == sum(iterable)
-                end
-                @testset "with progress" begin
-                    res = @test_deprecated pmapsum_elementwise(identity,iterable, showprogress=true)
-                    @test res == sum(iterable) 
-                end
-                res = pmapsum_elementwise(identity,(iterable,))
-                @test res == sum(iterable)
-			    res = pmapsum_elementwise(identity,Int,iterable)
-			    @test res == sum(iterable)
-			    res = pmapsum_elementwise(identity,Int,(iterable,))
-			    @test res == sum(iterable)
-
-			    iterable = 1:100
-			    res = pmapsum_elementwise(x->x^2,iterable)
-			    @test res == sum(x->x^2,iterable)
-			    @test res == pmapsum(plist->sum(x[1]^2 for x in plist),iterable)
-			end
-
-			@testset "run elsewhere" begin
-				iterable = 1:100
-				res_exp = sum(iterable)
-                c = Channel{Tuple{Int,Int,Bool}}(nworkers())
-                tasks = Vector{Task}(undef,nworkers())
-                @sync begin 
-                    for (ind,p) in enumerate(workers())
-                        tasks[ind] = @async begin
-                            try
-                                res = @fetchfrom p pmapsum_elementwise(identity,iterable)
-                                put!(c,(ind,res,false))
-                            catch
-                                put!(c,(ind,0,true))
-                                rethrow()
-                            end
-                        end
-                    end
-                    for i = 1:nworkers()
-                        ind,res,err = take!(c)
-                        err && wait(tasks[ind])
-                        @test res == res_exp
-                        showworkernumber(i,nworkers())
-                    end
-                end
-		    end;
-
-		    @testset "errors" begin
-		        @test_throws Exception pmapsum_elementwise(x->error("hi"),1:10)
-		    end
-		end
-
-        @testset "type coercion" begin
-            @test_throws Exception pmapsum(x->[1.1],Vector{Int},1:nworkers())
-            @test pmapsum(x->ones(2).*myid(),Vector{Int},1:nworkers()) isa Vector{Int}
-        end
-	end;
-
-	@testsetwithinfo "pmapreduce_commutative" begin
-	    @testsetwithinfo "batch" begin
-			@testset "sum" begin
-                res_exp = sum(workers())
-                @testset "without progress" begin
-                    res = pmapreduce_commutative(x->myid(),Int,sum,Int,1:nworkers())
-                    @test res == res_exp
-                    res = pmapreduce_commutative(x->myid(),sum,1:nworkers())
-                    @test res == res_exp
-                end
-                @testset "with progress" begin
-                    res = @test_deprecated pmapreduce_commutative(x->myid(),Int,sum,Int,1:nworkers(), showprogress=true)
-    			    @test res == res_exp
-                    res = @test_deprecated  pmapreduce_commutative(x->myid(),sum,1:nworkers(), showprogress=true)
-                    @test res == res_exp
-                end
-                @test pmapreduce_commutative(x->myid(),Int,sum,Int,(1:nworkers(),)) == res_exp
-			    @test pmapreduce_commutative(x->myid(),sum,(1:nworkers(),)) == res_exp
-			    @test pmapreduce_commutative(x->myid(),Int,sum,Int,(1:nworkers(),1:1)) == res_exp
-                @test pmapreduce_commutative(x->myid(),sum,(1:nworkers(),1:1)) == res_exp
-			    @test pmapreduce_commutative(x->myid(),sum,1:nworkers()) == pmapsum(x->myid(),1:nworkers())
-		    end
-		    @testset "prod" begin
-			    @test pmapreduce_commutative(x->myid(),prod,1:nworkers()) == prod(workers())
-			    @test pmapreduce_commutative(x->myid(),prod,(1:nworkers(),)) == prod(workers())
-			    @test pmapreduce_commutative(x->myid(),prod,(1:nworkers(),1:1)) == prod(workers())
-		    end
-
-		    @testsetwithinfo "run elsewhere" begin
-		    	res_exp = prod(workers())
-                c = Channel{Tuple{Int,Int,Bool}}(nworkers())
-                tasks = Vector{Task}(undef,nworkers())
-                @sync begin 
-                    for (ind,p) in enumerate(workers())
-                        tasks[ind] = @async begin
-                            try
-                                res = @fetchfrom p pmapreduce_commutative(x->myid(),prod,1:nworkers())
-                                put!(c,(ind,res,false))
-                            catch
-                                put!(c,(ind,0,true))
-                                rethrow()
-                            end
-                        end
-                    end
-                    for i = 1:nworkers()
-                        ind,res,err = take!(c)
-                        err && wait(tasks[ind])
-                        @test res == res_exp
-                        showworkernumber(i,nworkers())
-                    end
-                end
-		    end
-
-		    @testset "errors" begin
-		        @test_throws Exception pmapreduce_commutative(
-												x->error("map"),sum,1:10)
-		        @test_throws Exception pmapreduce_commutative(
-												identity,x->error("reduce"),1:10)
-				@test_throws Exception pmapreduce_commutative(
-												x->error("map"),x->error("reduce"),1:10)
-
-                @test_throws Exception pmapreduce_commutative(
-                                                x->fmap("map"),sum,1:10)
-                @test_throws Exception pmapreduce_commutative(
-                                                x->1,x->fred(x),1:10)
-                @test_throws Exception pmapreduce_commutative(
-                                                x->fmap(x),x->fred(x),1:10)
-		    end
-
-            @testset "type coercion" begin
-                @test_throws Exception pmapreduce_commutative(x->[1.1],Vector{Int},
-                                                sum,Vector{Int},1:nworkers())
-                res = pmapreduce_commutative(x->ones(2).*myid(),Vector{Int},sum,Vector{Int},1:nworkers())
-                @test res isa Vector{Int}
-            end
-		end;
-
-		@testsetwithinfo "elementwise" begin
-			@testset "comparison with map" begin
-			    iter = 1:1000
-                res_exp = sum(x->x^2,iter)
-                @testset "without progress" begin
-                    res = pmapreduce_commutative_elementwise(x->x^2,sum,iter)
-                    @test res == res_exp
-                end
-                @testset "with progress" begin
-    			    res = @test_deprecated pmapreduce_commutative_elementwise(x->x^2,sum,iter, showprogress=true)
-    			    @test res == res_exp
-                end
-			    @test res == pmapsum_elementwise(x->x^2,iter)
-			    @test res == pmapsum(plist->sum(x[1]^2 for x in plist),iter)
-			    res = pmapreduce_commutative_elementwise(x->x^2,sum,(iter,))
-			    @test res == res_exp
-                res = pmapreduce_commutative_elementwise(x->x^2,Int,sum,Int,(iter,))
-                @test res == res_exp
-                res = pmapreduce_commutative_elementwise(x->x^2,Int,sum,Int,iter)
-                @test res == res_exp
-                res = pmapreduce_commutative_elementwise(x->x^2,Int,x->float(sum(x)),Float64,iter)
-                @test res == float(res_exp)
-			end
-
-			@testsetwithinfo "run elsewhere" begin
-				iter = 1:1000
-				res_exp = sum(x->x^2,iter)
-                c = Channel{Tuple{Int,Int,Bool}}(nworkers())
-                tasks = Vector{Task}(undef,nworkers())
-                @sync begin 
-                    for (ind,p) in enumerate(workers())
-                        tasks[ind] = @async begin
-                            try
-                                res = @fetchfrom p pmapreduce_commutative_elementwise(x->x^2,sum,iter)
-                                put!(c,(ind,res,false))
-                            catch
-                                put!(c,(ind,0,true))
-                                rethrow()
-                            end
-                        end
-                    end
-                    for i = 1:nworkers()
-                        ind,res,err = take!(c)
-                        err && wait(tasks[ind])
-                        @test res == res_exp
-                        showworkernumber(i,nworkers())
-                    end
-                end
-		    end
-
-			@testsetwithinfo "errors" begin
-				@test_throws Exception pmapreduce_commutative_elementwise(
-												x->error("map"),sum,1:10)
-				@test_throws Exception pmapreduce_commutative_elementwise(
-												identity,x->error("reduce"),1:10)
-				@test_throws Exception pmapreduce_commutative_elementwise(
-												x->error("map"),
-												x->error("reduce"),1:10)
-			end
-		end;
-	end;
-    
-	@testsetwithinfo "pmapreduce" begin
-		@testsetwithinfo "batch" begin
-		    @testset "sum" begin
-                res_exp = sum(workers())
-                @testset "without progress" begin
-                    @test pmapreduce(x->myid(),Int,sum,Int,1:nworkers()) == res_exp
-                    @test pmapreduce(x->myid(),sum,1:nworkers()) == res_exp
-                end
-                @testset "without progress" begin
-                    res = @test_deprecated pmapreduce(x->myid(),Int,sum,Int,1:nworkers(), showprogress=true)
-                    @test res == res_exp
-                    res = @test_deprecated pmapreduce(x->myid(),sum,1:nworkers(), showprogress=true)
-                    @test res == res_exp
-                end
-			    @test pmapreduce(x->myid(),Int,sum,Int,(1:nworkers(),)) == res_exp
-			    @test pmapreduce(x->myid(),sum,(1:nworkers(),)) == res_exp
-                @test pmapreduce(x->myid(),Int,sum,Int,(1:nworkers(),1:1)) == res_exp
-			    @test pmapreduce(x->myid(),sum,(1:nworkers(),1:1)) == res_exp
-
-                @testset "comparison with pmapsum" begin
-                    res_exp = pmapsum(x->myid(),1:nworkers())
-                    @test pmapreduce(x->myid(),Int,sum,Int,1:nworkers()) == res_exp
-                    @test pmapreduce(x->myid(),sum,1:nworkers()) == res_exp
-                end
-		    end;
-
-		    @testset "concatenation" begin
-                res_vcat = ones(2*nworkers())
-                res_hcat = ones(2,nworkers())
-			    @test pmapreduce(x->ones(2),Vector{Float64},
-                    x->vcat(x...),Vector{Float64},1:nworkers()) == res_vcat
-                @test pmapreduce(x->ones(2),x->vcat(x...),1:nworkers()) == res_vcat
-			    @test pmapreduce(x->ones(2),Vector{Float64},
-                    x->hcat(x...),Matrix{Float64},1:nworkers()) == res_hcat
-                @test pmapreduce(x->ones(2),x->hcat(x...),1:nworkers()) == res_hcat
-
-                @testset "sorting" begin
-                    @test pmapreduce(x->ones(2)*x[1][1],x->vcat(x...),1:nworkers()) == 
-                            vcat((ones(2).*i for i=1:nworkers())...)
-
-                    @test pmapreduce(x->x[1][1],x->vcat(x...),1:nworkers()) == collect(1:nworkers())
-                    @test pmapreduce(x->myid(),Int,x->vcat(x...),Vector{Int},(1:nworkers(),)) == workers()
-                    @test pmapreduce(x->myid(),x->vcat(x...),1:nworkers()) == workers()
-                end
-
-                @testset "sumcat_aligned" begin
-                    @testset "sumcat_aligned" begin
-                        res_vcat = zeros(Float64, 2, 1:nworkers() + 3)
-                        for ind in 1:nworkers()
-                            res_vcat[:, ind .+ (0:3)] .+= 1
-                        end
-                        out = pmapreduce(x->ones(2, ParallelUtilities.workerrank(x) .+ (0:3)), 
-                            x->ParallelUtilities.sumcat_aligned(x..., dims=2), 1:nworkers())
-                        @test out == res_vcat
-
-                        res_vcat = zeros(Float64, 1:nworkers() + 3)
-                        for ind in 1:nworkers()
-                            res_vcat[ind .+ (0:3)] .+= 1
-                        end
-                        out = pmapreduce(x->ones(ParallelUtilities.workerrank(x) .+ (0:3)), 
-                            x->ParallelUtilities.sumcat_aligned(x..., dims=1), 1:nworkers())
-                        @test out == res_vcat
-
-                        r = 2:2 + nworkers()
-                        a1 = zeros(r, r)
-                        for ind in 1:nworkers()
-                            r = (1:2) .+ ind
-                            a1[r , r] .+= 1
-                        end
-
-                        a12 = pmapreduce(x->(r = ParallelUtilities.workerrank(x) .+ (1:2); ones(r,r)),
-                            x->ParallelUtilities.sumcat_aligned(x..., dims=(1,2)), 1:nworkers())
-                        @test a12 == a1
-                    end
-                    @testset "sumhcat_aligned" begin
-                        res_vcat = zeros(Float64, 2, 1:nworkers() + 3)
-                        for ind in 1:nworkers()
-                            res_vcat[:, ind .+ (0:3)] .+= 1
-                        end
-                        out = pmapreduce(x->ones(2, ParallelUtilities.workerrank(x) .+ (0:3)), 
-                            x->ParallelUtilities.sumhcat_aligned(x...), 1:nworkers())
-                        @test out == res_vcat
-                    end
-                    @testset "sumvcat_aligned" begin
-                        res_vcat = zeros(Float64, 1:nworkers() + 3)
-                        for ind in 1:nworkers()
-                            res_vcat[ind .+ (0:3)] .+= 1
-                        end
-                        out = pmapreduce(x->ones(ParallelUtilities.workerrank(x) .+ (0:3)), 
-                            x->ParallelUtilities.sumvcat_aligned(x...), 1:nworkers())
-                        @test out == res_vcat
-                    end
-                end
-		    end;
-
-			@testsetwithinfo "run elsewhere" begin
-                @testsetwithinfo "sum" begin
-    				res_exp = sum(workers())
-                    c = Channel{Tuple{Int,Int,Bool}}(nworkers())
-                    tasks = Vector{Task}(undef,nworkers())
-                    @sync begin 
-                        for (ind,p) in enumerate(workers())
-                            tasks[ind] = @async begin
-                                try
-                                    res = @fetchfrom p pmapreduce(x->myid(),sum,1:nworkers())
-                                    put!(c,(ind,res,false))
-                                catch
-                                    put!(c,(ind,0,true))
-                                    rethrow()
-                                end
-                            end
-                        end
-                        for i = 1:nworkers()
-                            ind,res,err = take!(c)
-                            err && wait(tasks[ind])
-                            @test res == res_exp
-                            showworkernumber(i,nworkers())
-                        end
-                    end
-                end
-                # concatenation where the rank is used in the mapping function
-                # Preserves order of the iterators
-                @testsetwithinfo "concatenation using rank" begin
-                    res_exp = collect(1:nworkers())
-                    c = Channel{Tuple{Int,Vector{Int},Bool}}(nworkers())
-                    tasks = Vector{Task}(undef,nworkers())
-                    @sync begin 
-                        for (ind,p) in enumerate(workers())
-                            tasks[ind] = @async begin
-                                try
-                                    res = @fetchfrom p pmapreduce(x->x[1][1],x->vcat(x...),1:nworkers())
-                                    put!(c,(ind,res,false))
-                                catch
-                                    put!(c,(ind,Int[],true))
-                                    rethrow()
-                                end
-                            end
-                        end
-                        for i = 1:nworkers()
-                            ind,res,err = take!(c)
-                            err && wait(tasks[ind])
-                            @test res == res_exp
-                            showworkernumber(i,nworkers())
-                        end
-                    end
-                end
-		    end;
-
-			@testset "errors" begin
-			    @test_throws Exception pmapreduce(x->error("map"),sum,1:10)
-				@test_throws Exception pmapreduce(identity,x->error("reduce"),1:10)
-				@test_throws Exception pmapreduce(x->error("map"),x->error("reduce"),1:10)
-                @test_throws Exception pmapreduce(x->fmap(x),sum,1:10)
-                @test_throws Exception pmapreduce(x->1,x->fred(x),1:10)
-                @test_throws Exception pmapreduce(x->fmap(x),x->fred(x),1:10)
-			end;
-
-            @testset "type coercion" begin
-                @test_throws Exception pmapreduce(x->[1.1],Vector{Int},sum,Vector{Int},1:nworkers())
-                @test pmapreduce(x->ones(2).*myid(),Vector{Int},sum,Vector{Int},1:nworkers()) isa Vector{Int}
-            end;
-		end;
-	end;
-end;
-
-@testset "show" begin
-
-    @testset "error" begin
-        io = IOBuffer()
-        
-        showerror(io,ParallelUtilities.TaskNotPresentError((1:4,),(5,)))
-        strexp = "could not find the task $((5,)) in the list $((1:4,))"
-        @test String(take!(io)) == strexp
-    end;
-
-    @testset "BranchChannel" begin
-        io = IOBuffer()
-
-        b = BranchChannel{Any,Any}(1,0)
-        show(io,b)
-        strexp = "Leaf  : 1 ← 1"
-        @test String(take!(io)) == strexp
-
-        b = BranchChannel{Any,Any}(1,1)
-        show(io,b)
-        strexp = "Branch: 1 ← 1 ← 1 child"
-        @test String(take!(io)) == strexp
-
-        b = BranchChannel{Any,Any}(1,2)
-        show(io,b)
-        strexp = "Branch: 1 ← 1 ⇇ 2 children"
-        @test String(take!(io)) == strexp
-    end;
-
-    @testset "BinaryTreeNode" begin
-        io = IOBuffer()
-        b = BinaryTreeNode(2,3,1)
-        show(io,b)
-        strexp = "BinaryTreeNode(p = 2, parent = 3, nchildren = 1)"
-        @test String(take!(io)) == strexp
-    end;
-end;
\ No newline at end of file