1 Introduction

This analysis looks at the field data collected for the following 2017 MLRA project: “MLRA 111B - Glynwood B-slope Erosion; Northeastern IN”

This project has several objectives: 1. Resolve conflicts between county boundaries in Northeastern IN, where different erosion phases join. 2. Develop an operational definition of erosion classes. 3. Spatially disaggregate the existing SSURGO polygons for Glynwood B-slope map units, in order to separate different soil erosion classes within a polygon. 4. Develop a raster product of spatial disaggregated erosion classes that can be incorporated into gSSURGO. 5. Confirm that the existing soil component horizon data adequately matches the field data.

The problem with the current Glynwood map units is that they were developed using different concepts of erosion classes. This resulted in the discrepency between county boundaries. By developing a raster model we should be able to separate out different erosion classes within individual polygons, evaluate their composition, ranked them according to their percentage of severe erosion, and correlated them to the correct MLRA map unit. In addition, the detailed raster product could be incorporated into gSSURGO and delivered via the Geospatial Gateway. This project is deemed relevant due to the effect of erosion classes on crop yields, which is particularly pronounced during periods of drought. Given the current interest in Soil Health, this new information should provide an estimate of Glynwood’s potential.

2 Data Preparation

library(reshape2)
library(ggplot2)
library(gridExtra)
library(knitr)

library(cluster)
library(caret)
library(party)
library(vegan)

library(rgdal)
library(sp)
library(sf)
library(raster)
library(mapview)
library(gdalUtils)

library(aqp)
library(soilDB)

2.1 Polygon Data

# load soil series extent of Glynwood from SoilWeb
gw_series <- seriesExtent("Glynwood")
gw_series <- st_as_sf(gw_series)


# load project from NASIS
project <- get_projectmapunit_from_NASIS()
project_nodups <- project[!duplicated(project$nationalmusym), c("nationalmusym", "muname")]


# load mupolygon from geodatabase
mupolygon <- read_sf(dsn = paste0(geodata, "soils/2017/MLRA_11_FIN_FY17.gdb"), layer = "MUPOLYGON")
st_crs(mupolygon) <- st_crs("+init=epsg:5070")
mupolygon_sub <- mupolygon[mupolygon$MUKEY %in% project$mukey, ]


# load sapolygon from geodatabase
sapolygon <- read_sf(dsn = paste0(geodata, "soils/2017/MLRA_11_FIN_FY17.gdb"), layer = "SAPOLYGON")
st_crs(sapolygon) <- st_crs("+init=epsg:5070")
sapolygon_sub <- sapolygon[sapolygon$AREASYMBOL %in% unique(project$areasymbol), ]
sapolygon_sub <- st_transform(sapolygon_sub, crs = "+init=epsg:4326")


# load site table from NASIS
gw_s <- get_site_data_from_NASIS_db()
gw_s <- gw_s[complete.cases(gw_s[c("x", "y")]), ]

gw_sp <- gw_s
coordinates(gw_sp) <- ~ x + y
proj4string(gw_sp) <- CRS("+init=epsg:4326")

gw_sf <- st_as_sf(gw_sp)
gw_sf <- st_transform(gw_sf, crs = "+init=epsg:5070")


# intersect mupolygons and points
mupolygon_sub2 <- mupolygon_sub[mupolygon_sub$AREASYMBOL %in% project$areasymbol, ]
idx <- st_intersects(gw_sf, mupolygon_sub2)
idx2 <- lapply(idx, function(x) ifelse(length(x) == 0, NA, x))
mupolygon_int <- mupolygon_sub2[unlist(idx2), ]
mupolygon_int$upedonid <- gw_sf$pedon_id


# save and export files
write_sf(mupolygon_sub, 
         dsn = paste0(ownCloud, "mupolygon.shp"), 
         layer = "mupolygon", 
         driver = "ESRI Shapefile",
         over
         )
save(mupolygon_int, mupolygon_sub, sapolygon_sub, gw_series, file = paste0(ownCloud, "glynwood_pol.RData"))

2.2 Pedon Data

The pedon data for this project has been entered into NASIS. Also the site observation table was populated with the projectname to make the data easy to query. Originally the data was exported from NASIS using the NASIS report “PEDON - ArcSIE Data Dump 11-FIN v2.0” from the MLRA11_Indianapolis Group folder, and was titled “Pts_gnbero_27Jan17.csv”. In more recent iterations of the analysis the data was loaded and processed in R.

# load cahced polygon data
load(file = paste0(ownCloud, "glynwood_pol.RData"))


# load pedons from NASIS
gp <- fetchNASIS()

vars <- c("peiid", "pedon_id", "taxonname", "x", "y", "describer", "erocl")
s <- site(gp)[vars]
h <- horizons(gp)


# extract relevant fields from the horizon data
g_vars <- c("peiid")
A_vars <- c("hzdept", "clay", "texture", "m_hue", "m_value", "m_chroma")
Bt_vars <- c("hzdept", "clay", "texture", "m_value", "m_chroma")
carb_vars <- c("hzdept", "effervescence")
solum_vars <- c("hzdept")

h2 <- by(h, h[g_vars], function(x) data.frame(
  # grouping variable
  x[g_vars][1, ],
  # A horizon data
  x[A_vars][1, ],
  # Bt horizon data
  x[grepl("Bt", x$hzname), Bt_vars, ][1, ],
  # CaCO3 data
  x[x$effervescence %in% c("strong", "violent"), carb_vars][1,],
  # solum depth
  x[grepl("^C|^2C|^3C", x$hzname), solum_vars][1]
  ))
h2 <- do.call("rbind", h2)

names(h2) <- c(g_vars, "A_hzthk", "A_clay", "A_texture", "A_m_hue", "A_m_value", "A_m_chroma", "topsoil_thk", "Bt_clay", "Bt_texture", "Bt_m_value", "Bt_m_chroma", "CaCO3_dep", "effervescence", "solum_dep")
names(s)[c(2:5, 7)]  <- c("upedonid", "soilname", "long", "lat", "erocl")

h2 <- within(h2,{
             CaCO3_dep[is.na(CaCO3_dep)] <- 200
             solum_dep[is.na(solum_dep)] <- 200
             })

gw1 <- merge(s, h2, by = "peiid", all.x = TRUE)
gw1 <- gw1[complete.cases(gw1[c("lat", "long")]), ]


# load original field data with SIE predictions
gw2 <- read.csv(paste0(ownCloud, "Pts_gnbero_27Jan17.csv"))
vars <- c("upedonid", "EroClassSIE", "relpos", "SlopeSIE", "wetness", "PlanCrv", "ProfCrv", "maxcrv", "mincrv")
gw <- merge(gw1, gw2[vars], by = "upedonid", all.x = TRUE)
names(gw)[names(gw) == "EroClassSIE"] <- "erocl_sie"

# extract mupolygon data from field coordinates and merge with mapunit data from NASIS
vars <- c("AREASYMBOL", "nationalmusym", "MUNAME", "upedonid")
test <- data.frame(mupolygon_int)[vars]
gw <- merge(gw, test, by = "upedonid", all.x = TRUE)

# erosion labels
ero_labels <- c("undisturbed", "slight", "moderate", "severe")

# transform and compute new variables
gw <- within(gw, {
  # convert erosion codes to labels
  erocl  = factor(erocl, levels = 0:3, labels = ero_labels)
  erocl_sie = factor(erocl_sie, levels = 0:3, labels = ero_labels)
  erocl2 = ifelse(erocl == "severe", "severe", "slight")
  
  # extract erosion phases from mapunit names
  EroClassNASIS = NA
  EroClassNASIS[grepl("eroded", MUNAME)] = "eroded"
  EroClassNASIS[grepl("sev.|severely", MUNAME)] = "sev.eroded"
  EroClassNASIS[!grepl("eroded", MUNAME)] = "non.eroded"
  
  # extract landform from mapunit names
  landform = NA
  landform[grepl("ground moraine", MUNAME)] = "ground"
  landform[grepl("end moraine", MUNAME)] =  "end"
  
  # correlate field taxonnames to mapunit components
  soilname2 = soilname
  soilname2 = ifelse(soilname2 %in% c("Glynwood", "Morley", "Shinrock", "Rawson", "Mississinewa"), "Glynwood", soilname2)
  soilname2 = ifelse(soilname2 %in% c("Blount", "Elliott"), "Blount", soilname2)
  soilname2 = ifelse(soilname2 %in% c("Pewamo", "Pandora", "Mermill"), "Pewamo", soilname2)
  soilname3 = paste0(soilname2, ifelse(soilname2 == "Glynwood", paste0("-", erocl2), ""))
  
  # difference between A and Bt horizons
  clay_dif = Bt_clay - A_clay
  tex_dif  = Bt_texture == A_texture
  dep_dif  = topsoil_thk - 15
  value_dif = Bt_m_value - A_m_value
  chroma_dif = Bt_m_chroma - A_m_chroma
  })

# convert transformed field dataset to a spatial object
gw_sp <- gw
coordinates(gw_sp) <- ~ long + lat
proj4string(gw_sp) <- CRS("+init=epsg:4326")

gw_sf <- st_as_sf(gw_sp)

gw_sp <- spTransform(gw_sp, CRS("+init=epsg:5070"))

writeOGR(gw_sp, dsn = paste0(ownCloud, "points.shp"), layer = "points", driver = "ESRI Shapefile", overwrite_layer = TRUE)

2.3 Spatial Data

The geodata from the Glynwood points was extracted from several rasters at various resolutions. The data using to generate the ArcSIE model came from a DEM with a resolution of 15-feet. The other geodata came from the 10-meter USGS NED, which was primarily resampled from LiDAR.

# Extract data from rasters
# NW files
fd <- paste0(geodata, "project_data/11FIN/PointDataEval/")
dd <- c("slope10",
        "procrv10",
        "plncrv10",
        "maxcrv10",
        "mincrv10",
        "relpos_r5",
        "wetness_mp"
        )
fp <- paste0(fd, "Mosaic_NW_pts/Derivatives/", dd, "/", "w001001.adf")
rs <- stack(fp)
names(rs) <- dd
proj4string(rs) <- CRS("+init=epsg:2965")
gd_nw <- extract(rs, gw_sp, df = TRUE, sp = TRUE)@data
gd_nw <- subset(gd_nw, !is.na(slope10))

# SE files
fp <- paste0(fd, "Mosaic_SE_pts/Derivatives/", dd, "/", "w001001.adf")
rs <- stack(fp)
names(rs) <- dd
proj4string(rs) <- CRS("+init=epsg:2965")
gd_se <- extract(rs, gw_sp, df = TRUE, sp = TRUE)@data
gd_se <- subset(gd_se, !is.na(slope10))

gd15ft <- rbind(gd_nw, gd_se)
rm(gd_nw, gd_se)
write.csv(gd15ft, file = paste0(ownCloud, "geodata_15ft_extract.csv"))


# Region 11 files

subset_rasters <- function(input, output) {
  cat(paste0(input, "\n"))
  gdal_translate(
    src_dataset = input,
    dst_dataset = output,
    projwin = c(bb[1], bb[4], bb[3], bb[2]),
    of = "GTiff",
    a_nodata = -99999,
    overwrite = TRUE,
    verbose = TRUE
    )
  }

warp_rasters <- function(input, output){
  cat(paste0(input,"\n"))
  gdalwarp(
    srcfile = input,
    dstfile = output,
    te = bb,
    s_srs = CRSargs(CRS("+init=epsg:5070")),
    t_srs = CRSargs(CRS("+init=epsg:5070")),
    r = "bilinear",
    tr = c(10, 10),
    of = "GTiff",
    overwrite = TRUE,
    verbose = TRUE 
    )
  }

fd <- paste0(geodata, "project_data/11FIN/")
dd2 <- c("ned10m_11FIN.tif",
        "ned10m_11FIN_aspect5.tif",
        "ned10m_11FIN_slopeshape.tif",
        "ned30m_11FIN_mvalleys.tif",
        "ned30m_11FIN_wetness.tif",
        "ned30m_11FIN_z2stream.tif",
        "nlcd30m_11FIN_lulc2011.tif"
        )
dd_names <- c("elev", "aspect5", "ss", "mvalley", "wetness2", "z2streams", "lulc")
dd <- paste0(fd, dd2)

input <- dd
output <- paste0(geodata, "project_data/11FIN/glynwood/", gsub(".sdat", ".tif", dd2))

mupolygon_in <- mupolygon_sub[grepl("^IN", mupolygon_sub$AREASYMBOL), ]
bb <- st_bbox(st_transform(mupolygon_in, crs = "+init=epsg:5070"))
# bb <- st_bbox(st_transform(gw_sf, crs = "+init=epsg:5070"))

mapply(subset_rasters, input, output)
mapply(subset_rasters, 
       input = paste0(geodata, "project_data/11FIN/nlcd30m_11FIN_lulc2011.tif"), 
       output = paste0(geodata, "project_data/11FIN/glynwood/nlcd30m_11FIN_lulc2011.tif")
       )
mapply(warp_rasters, input = output, output = gsub(".tif", "2.tif", gsub("30m", "10m", output)))

dd <- output
 
rs10m <- stack(dd[grepl("10m", dd)])
names(rs10m) <- c(dd_names[1:2], "kp5", "kt5", "slope5")
rs30m <- stack(dd[grepl("30m", dd)])
names(rs30m) <- dd_names[4:7]

rs10m <- stack(gsub(".tif", "2.tif", gsub("30m", "10m", output)))
names(rs10m) <- c(dd_names[1:2], "kp5", "kt5", "slope5", dd_names[4:7]) 

gd10m <- as.data.frame(extract(rs10m, gw_sp, df = TRUE, sp = TRUE))
gd30m <- as.data.frame(extract(rs30m, gw_sp, df = TRUE))
gw <- cbind(gd10m, gd30m[, -1])
rm(gd10m, gd30m)


# Save data
save(gw, gw_sf, gw_sp, ero_labels, file = paste0(ownCloud, "glynwood_geodata.RData"))

3 Plot Pedon Data, County Boundaries, and Series Extent

# Load cached dataset
load(paste0(ownCloud, "glynwood_pol.RData"))
load(paste0(ownCloud, "glynwood_geodata.RData"))


# Create interactive map
mapView(gw_series) + sapolygon_sub + gw_sf

4 Map Units vs Field Observations

vars <- c("erocl", "EroClassNASIS", "nationalmusym", "AREASYMBOL")
gw_sub <- gw[vars]
gw_dup <- gw[vars]

# Frequency of field observation vs map unit
# Duplicate the data for each REASYMBOL and relabel MLRA
gw_dup["AREASYMBOL"] <- "MLRA"
gw_dup <- rbind(gw_sub, gw_dup)
gw_dup$natmuSsaEro <-  with(gw_dup,
                            paste0(nationalmusym, "-", AREASYMBOL, "-", EroClassNASIS)
                            )
test <- xtabs(~ natmuSsaEro + erocl, data = gw_dup)
kable(test, caption = "Frequence by MUSYM-SSA-EROSION")

Frequence by MUSYM-SSA-EROSION
	undisturbed	slight	moderate	severe
2t6ll-IN009-sev.eroded	13	15	17	3
2t6ll-IN053-sev.eroded	4	13	18	13
2t6ll-IN075-sev.eroded	6	8	4	5
2t6ll-IN179-sev.eroded	1	9	6	10
2t6ll-MLRA-sev.eroded	24	45	45	31
2t6lm-IN009-sev.eroded	2	11	10	1
2t6lm-IN053-sev.eroded	2	4	8	11
2t6lm-IN075-sev.eroded	3	1	11	9
2t6lm-IN179-sev.eroded	9	8	0	13
2t6lm-MLRA-sev.eroded	16	24	29	34
2v4bn-IN069-eroded	5	4	7	6
2v4bn-IN179-eroded	0	3	4	6
2v4bn-MLRA-eroded	5	7	11	12
2v4bp-IN179-eroded	0	3	0	2
2v4bp-MLRA-eroded	0	3	0	2
5jjt-IN035-sev.eroded	1	1	2	9
5jjt-MLRA-sev.eroded	1	1	2	9
NA-MLRA-non.eroded	6	9	10	34
NA-NA-non.eroded	6	9	10	34

kable(round(prop.table(test, 1) * 100), caption = "Percent by MUSYM-SSA-EROSION")

Percent by MUSYM-SSA-EROSION
	undisturbed	slight	moderate	severe
2t6ll-IN009-sev.eroded	27	31	35	6
2t6ll-IN053-sev.eroded	8	27	38	27
2t6ll-IN075-sev.eroded	26	35	17	22
2t6ll-IN179-sev.eroded	4	35	23	38
2t6ll-MLRA-sev.eroded	17	31	31	21
2t6lm-IN009-sev.eroded	8	46	42	4
2t6lm-IN053-sev.eroded	8	16	32	44
2t6lm-IN075-sev.eroded	12	4	46	38
2t6lm-IN179-sev.eroded	30	27	0	43
2t6lm-MLRA-sev.eroded	16	23	28	33
2v4bn-IN069-eroded	23	18	32	27
2v4bn-IN179-eroded	0	23	31	46
2v4bn-MLRA-eroded	14	20	31	34
2v4bp-IN179-eroded	0	60	0	40
2v4bp-MLRA-eroded	0	60	0	40
5jjt-IN035-sev.eroded	8	8	15	69
5jjt-MLRA-sev.eroded	8	8	15	69
NA-MLRA-non.eroded	10	15	17	58
NA-NA-non.eroded	10	15	17	58

Several of counties phased severely eroded, are not dominanted by field observations classified as severely eroded.

5 Accuracy Assessment of the ArcSIE Predictions

cm <- confusionMatrix(data = gw$erocl_sie, reference = gw$erocl)
print(cm)

## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed           0      0        0      0
##   slight                9      6       10      9
##   moderate             27     35       51     37
##   severe               12     31       21     45
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3481          
##                  95% CI : (0.2937, 0.4057)
##     No Information Rate : 0.3106          
##     P-Value [Acc > NIR] : 0.09339         
##                                           
##                   Kappa : 0.0853          
##  Mcnemar's Test P-Value : 7.635e-15       
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                      0.0000       0.08333          0.6220
## Specificity                      1.0000       0.87330          0.5308
## Pos Pred Value                      NaN       0.17647          0.3400
## Neg Pred Value                   0.8362       0.74517          0.7832
## Prevalence                       0.1638       0.24573          0.2799
## Detection Rate                   0.0000       0.02048          0.1741
## Detection Prevalence             0.0000       0.11604          0.5119
## Balanced Accuracy                0.5000       0.47832          0.5764
##                      Class: severe
## Sensitivity                 0.4945
## Specificity                 0.6832
## Pos Pred Value              0.4128
## Neg Pred Value              0.7500
## Prevalence                  0.3106
## Detection Rate              0.1536
## Detection Prevalence        0.3720
## Balanced Accuracy           0.5888

test <- as.data.frame(cm$table)

ggplot(test, aes(x = Reference, y = Freq, fill = Prediction)) +
  geom_bar(stat = "identity") +
  coord_flip()

The accuracy of the current ArcSIE model appears to be low, according to several metrics. The positive predictive value for the severe class is < 50%.

5.1 Boxplots

soil_vals <- c("topsoil_thk", "solum_dep", "CaCO3_dep", "A_clay", "Bt_clay", "A_m_value", "A_m_chroma")
soil_dif_vals <- c("clay_dif", "tex_dif", "dep_dif", "value_dif", "chroma_dif")
geo_vals1 <- c("SlopeSIE", "ProfCrv", "PlanCrv", "relpos", "wetness")
geo_vals2 <- c("slope5", "kt5", "kp5", "z2streams", "wetness2", "mvalley")

vals <- c(soil_vals, soil_dif_vals, geo_vals1, geo_vals2)
gw <- gw[complete.cases(gw[c("erocl", soil_vals)]), ]

gw_lo1 <- melt(gw, id.vars = c("erocl", "landform"), measure.vars = vals)
gw_lo2 <- melt(gw, id.vars = c("erocl_sie", "landform"), measure.vars = vals)

names(gw_lo1)[1] <- "EroClass"
gw_lo1$method <- "FD"
names(gw_lo2)[1] <- "EroClass"
gw_lo2$method <- "SIE"
gw_lo <- rbind(gw_lo1, gw_lo2)
gw_lo <- subset(gw_lo, !is.na(EroClass))

gw_lo <- na.exclude(gw_lo)
ggplot(gw_lo1, aes(x = EroClass, y = value, color = landform)) +
  geom_boxplot() +
  facet_wrap(~ paste(variable, method), scales="free", ncol = 4) +
  coord_flip()

An exploratory analysis shows a considerable amount of overlap exists between the field determined (FD) erosion classes and measurable soil properties. In comparison the FD and SIE (Soil Inference Engine) erosion classes show different patterns within the boxplots, further suggesting that the SIE classes aren’t capturing the field observations accurately. The most important feature to highlight is that the trends between the SIE classes and digital elevation model (DEM) derivatives (i.e. slope) don’t match those observed for the FD classes. This mismatch suggests that the membership functions for the SIE classes are a poor fit, and should be redefined to more accurately represent the relationship between the FD classes and DEM derivatives.

The difference between the ground and end moraines appears to be minimal, although slope threshold for end moraines seems to be consistently lower, suggesting an interaction between slope and landform.

5.2 Scatterplots

soil_vals2 <- c("topsoil_thk", "solum_dep", "CaCO3_dep", "A_clay", "Bt_clay") # excluded color, only observed a narrow range thus small differences swamp everthing else
vals <- c(soil_vals2)

test <- gw[, vals]
test_d <- daisy(scale(test), metric = "gower")
test_mds <- metaMDS(test_d, distance = "gower", autotransform = FALSE, trace = FALSE)
test_pts <- cbind(as.data.frame(test_mds$points), erocl = gw$erocl)

g1 <- ggplot(gw, aes(x = Bt_clay, y = topsoil_thk, color = erocl)) +
  geom_point(cex = 2, alpha = 0.75) +
  xlim(c(max(gw$Bt_clay), min(gw$Bt_clay))) +
  theme(aspect.ratio = 1)
g2 <- ggplot(test_pts, aes(x = MDS1, y = MDS2, color = erocl)) +
  geom_point(cex = 2, alpha = 0.75) +
  theme(aspect.ratio = 1)
grid.arrange(g1, g2, ncol = 2)

According to the scatterplot above it appears that only the severe and slight classes are separatable. The moderate erosion class seems to overlap the most with slight. The overlap in the FD classes is likely due to bias within and between the soil scientists who collected the data. Both the 15-feet and 10-meter DEM derivatives were evaluated, but the results are similar.

5.3 Classification Tree

test <- subset(gw, !is.na(erocl))

test_ct <- ctree(erocl ~ ., data = test[, c("erocl", soil_vals)])
plot(test_ct)

cm <- confusionMatrix(data = predict(test_ct, type = "response"), reference = test$erocl)
print(cm)

## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed          42      5        6      1
##   slight                8     67        8      1
##   moderate              1     13       58      3
##   severe                0      2       23    109
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7954          
##                  95% CI : (0.7491, 0.8366)
##     No Information Rate : 0.3285          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7199          
##  Mcnemar's Test P-Value : 0.001127        
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                      0.8235        0.7701          0.6105
## Specificity                      0.9595        0.9346          0.9325
## Pos Pred Value                   0.7778        0.7976          0.7733
## Neg Pred Value                   0.9693        0.9240          0.8640
## Prevalence                       0.1470        0.2507          0.2738
## Detection Rate                   0.1210        0.1931          0.1671
## Detection Prevalence             0.1556        0.2421          0.2161
## Balanced Accuracy                0.8915        0.8524          0.7715
##                      Class: severe
## Sensitivity                 0.9561
## Specificity                 0.8927
## Pos Pred Value              0.8134
## Neg Pred Value              0.9765
## Prevalence                  0.3285
## Detection Rate              0.3141
## Detection Prevalence        0.3862
## Balanced Accuracy           0.9244

An analysis of the erocl above with a classification tree is an attempt to discern the hierachical structuce within the data. The results show topsoil thickness (topsoil_thk) and depth to CaCO3 (CaCO3_dep) are the first splits. The trees structure follows the logic described in the erosion indicators guide developed for this project, although more emphasis is given to CaCO3 than originally thought, which maybe an artifact of the wide range in CaCO3_dep relative to topsoil_thk. The overall accuracy for the tree is 0.8.

6 Hierachical Clusters

In order to see if more separation can be achieved amongst the erosion classes a hierachical classifition was peformed. Four unsupervised classes were chosen and manually matched to the FD classes.

test_c <- hclust(test_d, method = "ward.D")
plot(test_c, labels = gw$upedonid)
rect.hclust(test_c, k = 4)

clusters <- cbind(gw, 
                  test_pts[, 1:2], 
                  clusters = factor(cutree(test_c, k = 4),
                                    levels = c(2, 3, 1, 4),
                                    labels = ero_labels
                                    )
                  )
clusters <- cbind(gw, 
                  test_pts[, 1:2], 
                  clusters = factor(cutree(test_c, k = 4),
                                    levels = c(4, 3, 2, 1),
                                    labels = ero_labels
                                    )
                  )

xtabs(~ erocl + clusters, data = clusters)

##              clusters
## erocl         undisturbed slight moderate severe
##   undisturbed          32     16        3      0
##   slight               11     53       23      0
##   moderate              5     20       53     17
##   severe                0      2       40     72

6.1 Box Plots

gw_lo1 <- melt(gw, id.vars = "erocl", measure.vars = c(soil_vals, geo_vals2))
gw_lo3 <- melt(clusters, id.vars = "clusters", measure.vars = c(soil_vals, geo_vals2))

names(gw_lo1)[1] <- "EroClass"
names(gw_lo3)[1] <- "EroClass"
gw_lo1$method <- "FD"
gw_lo3$method <- "clusters"
gw_lo <- rbind(gw_lo1, gw_lo3)

ggplot(gw_lo, aes(x = EroClass, y = value)) +
  geom_boxplot() +
  facet_wrap(~ paste(variable, method), scales="free", ncol = 4) + 
  coord_flip()

A comparison of the FD and cluster classes shows that the clusters do a good job replicating the patterns found in the boxplots.

6.2 Scatter Plots

g1 <- ggplot(clusters, aes(x = MDS1, y = MDS2, col = erocl)) +
  geom_point(cex = 2, alpha = 0.75) +
  theme(aspect.ratio = 1)
g2 <- ggplot(clusters, aes(x = MDS1, y = MDS2, col = clusters), main = "test") +
  geom_point(cex = 2, alpha = 0.75) +
  theme(aspect.ratio = 1)
grid.arrange(g1, g2, ncol = 2)

In comparison the hierarchical clusters have less overlap when viewed along the multidimensional scaled axes, but do not seem to have greatly enhanced the separation of the moderate class. In comparison, the FD erocl appear to separate the classes into equally sized clusters, thus somewhat validating the field determinations (FD) of the soil scientists.

6.3 Classification Tree

test2 <- ctree(clusters ~ ., data = clusters[, c("clusters", soil_vals)])
plot(test2)

confusionMatrix(data = predict(test2, type = "response"), reference = clusters$clusters)

## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed          48      0        0      0
##   slight                0     82       16      0
##   moderate              0      6       97      7
##   severe                0      3        6     82
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8905          
##                  95% CI : (0.8528, 0.9213)
##     No Information Rate : 0.3429          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8502          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                      1.0000        0.9011          0.8151
## Specificity                      1.0000        0.9375          0.9430
## Pos Pred Value                   1.0000        0.8367          0.8818
## Neg Pred Value                   1.0000        0.9639          0.9072
## Prevalence                       0.1383        0.2622          0.3429
## Detection Rate                   0.1383        0.2363          0.2795
## Detection Prevalence             0.1383        0.2824          0.3170
## Balanced Accuracy                1.0000        0.9193          0.8791
##                      Class: severe
## Sensitivity                 0.9213
## Specificity                 0.9651
## Pos Pred Value              0.9011
## Neg Pred Value              0.9727
## Prevalence                  0.2565
## Detection Rate              0.2363
## Detection Prevalence        0.2622
## Balanced Accuracy           0.9432

In comparision, the classification tree for the clusters splits primarily on the CaCO3_dep and solum_dep, presumable due to the narrow range in topsoil_thk.

7 Statistical Modeling

Below several statistical models were evaluated to see if a more accurate model could be developed.

7.1 FD Classes vs DEM Derivatives

test3 <- ctree(erocl ~ ., data = gw[, c("erocl", geo_vals2)])
plot(test3)

cm_ct <- confusionMatrix(data = predict(test3, type = "response"), reference = gw$erocl)
round(cm_ct$overall, 2)

##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##           0.41           0.22           0.36           0.46           0.33 
## AccuracyPValue  McnemarPValue 
##           0.00           0.00

test3 <- cforest(as.factor(erocl) ~ ., data = gw[, c("erocl", geo_vals2)])
varimp(test3)

##      slope5         kt5         kp5   z2streams    wetness2     mvalley 
## 0.035244094 0.029307087 0.006834646 0.002251969 0.015244094 0.004929134

cm_cf <-confusionMatrix(data = predict(test3, type = "response", OOB = TRUE), reference = gw$erocl)
round(cm_cf$overall, 2)

##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##           0.42           0.20           0.37           0.47           0.33 
## AccuracyPValue  McnemarPValue 
##           0.00           0.02

Neither a classification tree or forest were capiable of achieving a significantly higher accuracy than the SIE model.

7.2 Clusters vs DEM Derivatives

test4 <- ctree(clusters ~ ., data = clusters[, c("clusters", geo_vals2)])
plot(test4)

cm_ct <- confusionMatrix(data = predict(test4, type = "response"), reference = clusters$clusters)
round(cm_ct$overall, 2)

##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##           0.39           0.16           0.34           0.45           0.34 
## AccuracyPValue  McnemarPValue 
##           0.03           0.00

test4 <- cforest(clusters ~ ., data = clusters[, c("clusters", geo_vals2)])
varimp(test4)

##       slope5          kt5          kp5    z2streams     wetness2 
##  0.015196850  0.033354331 -0.004000000  0.000976378  0.004834646 
##      mvalley 
##  0.003716535

cm_cf <- confusionMatrix(data = predict(test4, type = "response", OOB=TRUE), reference = clusters$clusters)
round(cm_cf$overall, 2)

##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##           0.36           0.11           0.31           0.42           0.34 
## AccuracyPValue  McnemarPValue 
##           0.23           0.01