1 Introduction

The concept of an “L1 profile” is based on the multivariate median, also referred to as the “geometric median” or “L1 median”. The L1 median represents an n-dimensional point that minimizes some distance metric to all other points in a data set. Consider the following 2-dimensional data (black points), estimated marginal medians (blue), and L1 median (red). The marginal medians are computed independently along each dimension which the L1 median is computed across all dimensions.

The “L1 profile” is a synthetic soil profile built from L1 median values along 1cm depth intervals. Consider 10 simulated profiles, each with 3 soil properties, all sharing a relatively common distribution with depth.

2 Demonstration

Demonstrate with real data.

library(aqp)
library(soilDB)
library(sharpshootR)
library(cluster)
library(reshape2)
library(latticeExtra)
library(Gmedian)
library(MASS)
library(tactile)

# get some interesting data
x <- fetchKSSL('plainfield')

# remove partial-profile records
x <- subset(x, pedon_completeness_index > 3)
x <- subset(x, pedon_key != '6678')

# this is rather drastic, consider other possible "repairs"
# fillHzGaps(x), accumulateDepths(x), repairMissingHzDepths(x)
# remove problematic horizonation
x <- HzDepthLogicSubset(x)

# normalize taxonname for use as a grouping variable
x$taxonname <- tolower(x$taxonname)

# compute weighted-mean particle diameter (mm) for later
x$wmpd <- with(horizons(x), ((vcs * 1.5) + (cs * 0.75) + (ms * 0.375) + (fs * 0.175) + (vfs *0.075) + (silt * 0.026) + (clay * 0.0015)) / (vcs + cs + ms + fs + vfs + silt + clay))

3 Aggregation

site(x)$group <- 'L1 Median'

# aggregation formula
fm <- group ~ sand + silt + clay + estimated_ph_h2o + bs82 + wmpd

# L1 medians
z <- L1_profiles(x, fm = fm, method = 'constant', maxDepthConstant = 150)

# marginal medians
a <- slab(x, fm = fm)

3.1 Sanity Check

# check that sand + silt + clay sum to 100
(z$ssc_total <- z$sand + z$silt + z$clay)
##   [1] 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100
##  [24] 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100
##  [47] 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100
##  [70] 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100
##  [93] 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100
## [116] 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100
## [139] 100 100 100 100 100 100 100 100 100 100 100 100 100

3.2 Merge Marginal + L1

## TODO: this involves a lot of dumb re-naming
vars <- c("sand", "silt", "clay", "estimated_ph_h2o", "bs82", "wmpd")
z.long <- reshape2::melt(as(z, 'data.frame'), id.vars = c(idname(z), horizonDepths(z)), measure.vars = vars)

new.vars <- c('variable', 'group', 'top', 'bottom', 'p.q50', 'p.q25', 'p.q75', 'contributing_fraction')

# re-name and adjust to match slab output
names(z.long)[1] <- 'group'
names(z.long)[5] <- 'p.q50'
z.long$p.q25 <- NA
z.long$p.q75 <- NA
# not sure how we can best utilize this
z.long$contributing_fraction <- 0
z.long <- z.long[, new.vars]

g <- make.groups(
  marginal = a[, new.vars],
  L1 = z.long
)

3.3 Graphical Comparison

# plotting style
tps <- tactile.theme(superpose.line = list(col = c('RoyalBlue', 'DarkRed', 'DarkGreen'), lwd = 2))

# fancy panel names
levels(g$variable) <- c('Sand (%)', 'Silt (%)', 'Clay (%)', 'pH 1:1 H2O', 'BS at pH 8.2 (%)', 'WMPD (mm)')

# plot grouped, aggregate data
xyplot(top ~ p.q50 | variable, groups = which, data=g, ylab='Depth',
       main = 'Plainfield Lab Data',
       xlab='median bounded by 25th and 75th percentiles',
       lower = g$p.q25, upper = g$p.q75, ylim = c(155,-5),
       panel = panel.depth_function, alpha=0.25, sync.colors=TRUE,
       prepanel = prepanel.depth_function,
       # cf = g$contributing_fraction,
       par.strip.text=list(cex=0.8),
       strip=strip.custom(bg=grey(0.85)),
       layout=c(6,1), scales=list(x=list(alternating=1, relation='free'), y=list(alternating=3)),
       auto.key = list(lines = TRUE, points = FALSE, columns = 2),
       par.settings = tps
)