Question: is it possible to generate a reasonable sorting of soil profiles (a SoilProfileCollection) using pair-wise distances computed only from generalized horizon labels (GHL)? I think that it might, as long as these labels are encoded as ordered factors. We can use profile_compare to test this hypothesis. One might attempt such a comparison within a collection of similar soils where the only reliable data available are horizon depths and designations.

We will be using a subset of the the loafercreek sample data from the {soilDB} package.

library(aqp)
library(soilDB)
library(sharpshootR)

library(cluster)
library(dendextend)
library(MASS)

library(corrplot)

# example data
data("loafercreek")
hzdesgnname(loafercreek) <- 'hzname'

# focus on profiles 20-40
x <- loafercreek[20:40, ]

# modify default arguments for some flair
par(mar = c(0, 0, 0, 2))
plotSPC(x, width = 0.33, name.style = 'center-center', cex.names = 0.65, print.id = FALSE)

Apply generalized horizon labels using overly-simplistic rules. This is only a demonstration.

# new labels
n <- c('O', 'A', 'B', 'Bt', 'BC', 'Cr', 'R')
# GHL patterns
p <- c('O', 'A', 'B', 'Bt', 'BC', 'Cr', 'R')

# note additional argument
horizons(x)$genhz <- generalize.hz(x$hzname, new = n, pat = p)

# the most important step, genhz must be encoded as an ordered factor
x$genhz <- ordered(x$genhz)

# check frequency of GHL assignment
# note that some are missing
table(x$genhz)
## 
##        O        A        B       Bt       BC       Cr        R not-used 
##        3       22       18       66       11        8       21        2

Remove profiles containing horizons assigned to the “not-used” GHL.

x$flag <- profileApply(x, function(i) {
  any(i$genhz == 'not-used')
})

# SPC-aware version of subset()
y <- subset(x, !flag)

# check: ok
table(y$genhz)
## 
##        O        A        B       Bt       BC       Cr        R not-used 
##        3       20       16       59       11        8       19        0
# add short ID for checking our work
site(y)$shortID <- 1:length(y)

par(mar = c(0, 0, 3, 2))
plotSPC(y, color = 'genhz', width = 0.33, name.style = 'center-center', cex.names = 0.65, fixLabelCollisions = FALSE, col.label = 'Generalized Horizon Label', label = 'shortID')

Add empty horizons to bottom of the profiles with bottom-most horizon depth < 150cm, label with ‘R’ GHL. Truncate deeper profiles to 150cm. See ?fillHzGaps and ?trunc for details.

# extend with empty horizons
z <- fillHzGaps(y, to_top = NA, to_bottom = 150)

# label as 'R'
# important to use one of the GHL specified in `genhz`
z$hzname[is.na(z$genhz)] <- 'R'
z$genhz[is.na(z$genhz)] <- 'R'

# truncate all profiles to 150cm
z <- trunc(z, z1 = 0, z2 = 150)

par(mar = c(0, 0, 3, 2))
plotSPC(z, color = 'genhz', width = 0.33, name.style = 'center-center',  cex.names = 0.65, col.label = 'Generalized Horizon Label', label = 'shortID')

Arrange profiles by depth to contact using plot.order argument to plotSPC. See ?getSoilDepthClass and ?estimateSoilDepth for details on estimating soil depth.

# add soil depth information to site
sdc <- getSoilDepthClass(z)
site(z) <- sdc

par(mar = c(0, 0, 3, 2))
plotSPC(z, color = 'genhz', width = 0.33, name.style = 'center-center', cex.names = 0.65, plot.order = order(z$depth), col.label = 'Generalized Horizon Label', label = 'shortID')

Compute pair-wise distances between profiles using genhz (an ordered factor) to a maximum depth of 150cm. See ?profile_compare and ?daisy for details. Divisive hierarchical clustering is applied to the distance matrix, with a final re-ordering step according to soil depth (dendextend::rotate).

# aqp >= 2.0
d <- NCSP(z, vars = 'genhz', maxDepth = 150, k = 0, rescaleResult = TRUE)

# divisive hierarchical clustering
h <- as.hclust(diana(d))

# attempt sorting dendrogram by soil depth
h <- dendextend::rotate(h, order(z$depth))

As suggested by Shawn Salley, a quick view of the distance matrix as an image. Pair-wise distances are encoded with color.

# expand dist object to full matrix form of the pair-wise distances 
m <- as.matrix(d)
# copy short IDs from Soil Profile Collection to full distance matrix
dimnames(m) <- list(z$shortID, z$shortID)

# invert device foreground / background colors for an artistic effect
# use colors from The Life Aquatic
par(bg = 'black', fg = 'white')
corrplot(
  m, 
  col = hcl.colors(n = 25, palette = 'zissou1'), 
  is.corr = FALSE, 
  col.lim = c(0, 1), 
  method = "color", 
  order = "original",
  type = "upper", 
  # tl.pos = "n",
  # cl.pos = "n",
  mar = c(0.1, 0, 0, 0.8)
) 

Rank, left to right, profiles according to median distance to all other profiles in the collection. Profiles 17, 6, 4, or 13 might be good candidates for typical pedons.

# convert to full matrix representation
m <- as.matrix(d)

# copy shorter IDs from SPC
dimnames(m) <- list(z$shortID, z$shortID)

# mask diagonal and lower triangle with NA
# ignoring these in subsequent steps
m[upper.tri(m, diag = TRUE)] <- NA

# extract distances to all other profiles
dl <- lapply(1:nrow(m), FUN = function(i) {
  .r <- c(m[i, ], m[, i])
  return(as.vector(na.omit(.r)))
})

# copy names
names(dl) <- dimnames(m)[[1]]

# compute median distance to all other profiles
(med.dist <- sort(sapply(dl, median, na.rm = TRUE)))
##        17         6         4        13        19        14         5         3        15         1 
## 0.2615694 0.2635815 0.2726358 0.2756539 0.2877264 0.2907445 0.3138833 0.3138833 0.3259557 0.3269618 
##        16         2        10        11        18        12         7         8         9 
## 0.3551308 0.3651911 0.3822938 0.3923541 0.4577465 0.4989940 0.5090543 0.5090543 0.6418511
# sorting vector for plotting sketches
idx <- match(names(med.dist), z$shortID)

par(mar = c(4, 0, 3, 2))
plotSPC(z, plot.order = idx, color = 'genhz', width = 0.33, name.style = 'center-center', cex.names = 0.65, col.label = 'Generalized Horizon Label', label = 'shortID')

axis(1, at = 1:length(z), labels = round(med.dist, 3), line = 0, cex.axis = 0.85)
mtext("Median distance to all other profiles (not to scale)", side = 1, line = 2.25)

Another view of median distance to all other profiles, as a set of connected points. This helps demonstrate the scale of differences within the collection. Note that values in the distance matrix d range from 0 to 1. We have to use some tricks to get the vertical spacing right.

# trick 1: adjust median distances to fit "above" profiles
# shift down 125cm
# rescale from 0 to 125cm
.yoff <- 125
med.dist.rescale <- .yoff - aqp:::.rescaleRange(med.dist, 0, .yoff)

# trick 2: plot profiles, shifting "down" 150cm
# use y.offset argument
par(mar = c(0.25, 1.5, 3, 1.5))
plotSPC(z, plot.order = idx, color = 'genhz', width = 0.33, name.style = 'center-center', cex.names = 0.65, col.label = 'Generalized Horizon Label', label = 'shortID', scaling.factor = 1, y.offset = 150)

# add the scaled + shifted median distances as points connected by lines
lines(x = 1:length(z), y = med.dist.rescale, type = 'b')

# trick 3: make a special axis describing median distance
# compute "pretty" intervals for the axis, on the shifted and scaled values
.p <- pretty(med.dist.rescale)

# use a linear model to convert between shifted/scaled and original values
# slope ~ scale
# intercept ~ shift
.m <- lm(med.dist ~ med.dist.rescale)

# convert pretty axis values back to original scale
.lab <- predict(.m, data.frame(med.dist.rescale = .p))

# add axis
axis(side = 2, at = .p, labels = round(.lab, 2), line = -1.5, las = 1, cex.axis = 0.75)

# annotate
mtext('Median distance to all other profiles (max = 1)', side = 3, at = 1, adj = 0, line = -2)

Use plotProfileDendrogram() from the sharpshootR package to align profile sketches with the terminal leaves of a dendrogram. When applied to other data, you may have to tinker with the scaling.factor and y.offset arguments.

par(mar = c(1, 0, 3, 2))
plotProfileDendrogram(z, clust = h, scaling.factor = 0.0095, y.offset = 0.15, color = 'genhz', width = 0.33, name.style = 'center-center', col.label = 'Generalized Horizon Label', cex.names = 0.65, label = 'shortID')

Use distance from 1st profile as a virtual transect via alignTransect, fixOverlap attempts to spread-out profiles along the x-axis to reduce overlap.

m <- as.matrix(d)

set.seed(10101)
pos <- alignTransect(m[1, ], x.min = 1, x.max = length(z), thresh = 0.5)

Distance from 1st profile (left-most, marked with a red asterisk) used to order collection along an integer sequence.

par(mar = c(4, 0, 3, 2))

plotSPC(z, plot.order = pos$order, color = 'genhz', width = 0.33, name.style = 'center-center', cex.names = 0.65, col.label = 'Generalized Horizon Label', label = 'shortID')

points(x = 1, y = -15, pch = '*', cex = 3, col = 'firebrick')

axis(1, at = 1:length(z), labels = round(pos$grad, 3), line = 0, cex.axis = 0.85)
mtext("Distance from 1st Profile (not to scale)", side = 1, line = 2.25)

Distance from 1st profile (left-most, marked with a red asterisk) used to place profiles at relative distances along the x-axis.

par(mar = c(4, 0, 3, 2))

plotSPC(z, plot.order = pos$order, relative.pos = pos$relative.pos, color = 'genhz', width = 0.2, name.style = 'center-center', name = NA, col.label = 'Generalized Horizon Label', label = 'shortID')

points(x = 1, y = -15, pch = '*', cex = 3, col = 'firebrick')

axis(1, at = pos$relative.pos, labels = round(pos$grad, 3), line = 0, cex.axis = 0.66)
mtext("Distance from 1st Profile", side = 1, line = 2.25)

Create an ordination from pair-wise distances using non-metric multidimensional scaling (MASS::sammon). It is necessary to add a small amount of distance back to the distance matrix before ordination for two reasons:

  1. we chose to rescale distances to the interval of [0,1] (recall rescaleResult = TRUE argument to NCSP())
  2. there are duplicate soil profiles in the collection
# add tiny amount for 0-distance pairs (duplicate profiles)
d <- d + 0.001

# extract nMDS scores from result
mds <- sammon(d)
## Initial stress        : 0.02655
## stress after  10 iters: 0.00931, magic = 0.500
## stress after  20 iters: 0.00908, magic = 0.500
## stress after  30 iters: 0.00905, magic = 0.500
mds <- mds$points

# ensure ordering is preserved:
# profile IDs -> distance matrix -> nMDS
all(row.names(mds) == profile_id(z))
## [1] TRUE

Re-scale nMDS coordinates to typical coordinate system used by plotSPC.

# re-scale nMDS axis 1 to the typical horizontal scale used by plotSPC 
xoff <- aqp:::.rescaleRange(mds[, 1], x0 = 1, x1 = length(z))

# re-scale nMDS axis 2 to the typical vertical scale used by plotSPC  
yoff <- aqp:::.rescaleRange(mds[, 2], x0 = -10, x1 = max(z))

Place (“hang”) soil profile sketches according to nMDS ordination (points). Overlap is a problem.

par(mar = c(0.25, 0.25, 3.5, 0.25))
# suppress horizon names with `names = NA`
plotSPC(z, y.offset = yoff, relative.pos = xoff, color = 'genhz', width = 0.25, name.style = 'center-center', name = NA, col.label = 'Generalized Horizon Label', label = 'shortID', scaling.factor = 0.75)

points(xoff, yoff, pch = 16, cex = 0.66)

grid(nx = 10, ny = 10)
abline(v = mean(xoff), h = mean(yoff), lty = 2)

box()

Demonstrate the effect of fixOverlap() on relative positions along nMDS axis 1.

# use re-scaled nMDS coordinates as virtual transect
# adjust to reduce overlap
# with an expansion of the x-axis out to length(z) + 5
set.seed(10110)
xoff.fixed <- fixOverlap(xoff, thresh = 0.65, min.x = 1, max.x = length(z) + 5, method = 'S')

# x,y coordinates (offsets) from nMDS axes 1,2
par(mar = c(3, 3, 1, 1))
plot(cbind(xoff, yoff), pch = 1, las = 1, axes = FALSE, ylab = '', xlab = '', xlim = c(min(xoff), max(xoff.fixed)))

# annotate with short IDs
text(cbind(xoff, yoff), labels = z$shortID, cex = 0.66, pos = 1, font = 3)

grid(nx= 10, ny = 10)
mtext('nMDS axis 1', side = 1, line = 1)
mtext('nMDS axis 2', side = 2, line = 1)

# adjusted x offsets, thanks to fixOverlap()
points(cbind(xoff.fixed, yoff), pch = 16)

# annotate with short IDs
text(cbind(xoff.fixed, yoff), labels = z$shortID, cex = 0.66, pos = 3, font = 2)

# arrows connect original, scaled nMDS axis 1 -> adjusted-for-overlap nMDS axis 1
arrows(
  x0 = xoff, y1 = yoff,
  x1 = xoff.fixed, y0 = yoff,
  len = 0.05  
)

legend('bottomleft', legend = c('original', 'adjusted'), pch = c(1, 16), box.col = 'white', text.font = c(3,2))
box()

Place soil profile sketches according to approximate positions within nMDS ordination (points). Note that y-offsets (nMDS axis 2 are inverted, due to “0 is at the top” depth logic used by plotSPC()). A very large output device is required for legible text.

par(mar = c(0.25, 0.25, 3.5, 0.25))
plotSPC(z, y.offset = yoff, relative.pos = xoff.fixed, color = 'genhz', width = 0.25, name.style = 'center-center', cex.names = 0.45, hz.depths = TRUE, col.label = 'Generalized Horizon Label', label = 'shortID', scaling.factor = 0.75)

# add adjusted nMDS points
points(xoff.fixed, yoff, pch = 16, cex = 0.75)

box()

Another approach, leaving out all of the text annotation. Morphologic patterns and their similarity are clear.

par(mar = c(0.25, 0.25, 3.5, 0.25))
plotSPC(z, y.offset = yoff, relative.pos = xoff.fixed, color = 'genhz', width = 0.25, name.style = 'center-center', hz.depths = FALSE, name = NA, col.label = 'Generalized Horizon Label', label = 'shortID', scaling.factor = 0.75)

# grid
grid(nx = 10, ny = 10)

# annotate origin of nMDS axes
points(x = mean(xoff), y = mean(yoff), pch = 3, col = 'black', lwd = 2)

box()


This document is based on aqp version 2.0.3.