!manual add time zero to “days on hops” experiment!
Anton Paar data were collected at time zero for kinetics experiment (triplicate samples run immediately after sample collection, while the remainder were being dry-hopped), but were inadvertently excluded from the original dataset. Here they are: Date time sample no. ABV ABW OE (P) Er Ea SG 20/20 RDF ADF sample_id 07/27/17 3:26 PM 03_ 6.68 5.2 15.95 6.08 3.74 1.01462 63.88 76.58 JK1-20-41 07/27/17 3:30 PM 04_ 6.68 5.2 15.95 6.08 3.73 1.01461 63.89 76.6 JK1-20-50 07/27/17 3:34 PM 05_ 6.68 5.2 15.95 6.08 3.74 1.01462 63.88 76.58 JK1-20-87
## fill in MISSING DATA :( add time zero data for time series analysis)
## nested ifelse to add time zero plato values for each expt:
mydata<- mydata %>% mutate(., initial_plato = ifelse(expt==" 2A", 3.74, ifelse(expt==" 1A", 3.69, ifelse(expt==" 1B", 3.53, ifelse(expt==" 3B", 3.54, ifelse(expt==" 4B", 4.66, NA))))))
`as_dictionary()` is soft-deprecated as of rlang 0.3.0.
Please use `as_data_pronoun()` instead
[90mThis warning is displayed once per session.[39m`new_overscope()` is soft-deprecated as of rlang 0.2.0.
Please use `new_data_mask()` instead
[90mThis warning is displayed once per session.[39mThe `parent` argument of `new_data_mask()` is deprecated.
The parent of the data mask is determined from either:
* The `env` argument of `eval_tidy()`
* Quosure environments when applicable
[90mThis warning is displayed once per session.[39m`overscope_clean()` is soft-deprecated as of rlang 0.2.0.
[90mThis warning is displayed once per session.[39m
## add complete time-zero anton paar data for time series (expt2A)
## quadruple-up each triplicate time-zero measurement (one copy for each NH/DH & rouse/still combo)
## note: this practice is of questionable statistical rigor. In retrospect it would have been better (and a lot easier) to just run 12 samples through Anton on day0!
timezero <- mydata %>% filter(expt==" 2A") %>% arrange(special_group) %>% slice(1:12)
#data.entry(timezero)
timezero$Test_Date <- rep(c("7/27/2017","7/27/2017","7/27/2017"),4)
timezero$Test.time <- rep(c("3:26 PM", "3:30 PM", "3:34 PM"),4)
timezero$ABV <- 6.68
timezero$ABW <- 5.2
timezero$OE <- 15.95
timezero$Er <- 6.08
timezero$Ea <- rep(c(3.74, 3.73, 3.74),4)
timezero$SG <- rep(c(1.01462, 1.01461, 1.01462),4)
timezero$RDF <- rep(c(63.88, 63.89, 63.88),4)
timezero$ADF <- rep(c(76.58, 76.60, 76.58),4)
timezero$Calories <- rep(c(215.37,215.34,215.35),4)
timezero[timezero$hop=="DH",]$contact_days <-0
timezero[,c(28:31)] <- 0
mydata <- rbind(timezero,mydata)
write.csv(mydata, "quickie.csv", row.names = FALSE)
mydata<-read.csv("quickie.csv", stringsAsFactors = FALSE)
### this is a routine for inspecting data ###
mydatatypes<- as.data.frame(cbind(sapply(mydata, class))) ## table of datatypes
mydatatypes$headers <- rownames(mydatatypes) ## convert rownames to values
na_count <-as.data.frame(sapply(mydata, function(y) sum(length(which(is.na(y)))))) ## count NAs by column
mydataOverview<-as.data.frame(cbind(na_count,mydatatypes)) ## table of NA counts and datatypes
names(mydataOverview) <- c("NA_count","Data_Class", "Header")
#sum(is.na(mydata)) ## total count of NA values in entire sheet (231 in this case)
#mydataOverview %>% filter(NA_count>0) ## breakdown of NA values
tidy & transform
note: this is a base R +dplyr data wrangling exercise! in general it’s best to use lubridate package for date/timestamps!
- create a date column. Here we create new date columns in POSIXct format, based on the particular date format used in the source data (ujbc_a_1469081_sm5496.txt; see ?as.POSIXct). First an example of how NOT to create new variables in our dataframe:
test_df<- mydata # (original dataframe to be used for testing/illustration)
x_brew_date.POSIXct<- as.POSIXct(mydata$brew_date, format = "%m/%d/%Y") # as a standalone vector (USELESS here), or:
test_df$brew_date.POSIXct <-as.POSIXct(mydata$brew_date, format = "%m/%d/%Y") # as a "new column" in our dataframe
- ADVANCED data wrangling:
- create multiple date columns using FORLOOP (here we take advantage of the pattern that of our (character) date/time columns have the string “date” in the headername. First make character vector of all column names containing string “date”, then run FORLOOP over each date column). This forloop will convert datecols into POSIXct format (R/universal datetime format). They say it’s best to avoid forloops in general, and in this case you would probably use functions from lubridate package - and in general be aware there’s probably a specific tool for the task at hand)!!
datecols<- dput(names(dplyr::select(mydata, matches("date")))) ## headers containing string "date" => c("brew_date", "sample_collection_date", "dryhop_date", "Test_Date")
c("brew_date", "sample_collection_date", "dryhop_date", "Test_Date"
)
## FORLOOP
for (icol in datecols) {
newcol = paste0(icol,".POSIXct")
# print(newcol)
mydata[, newcol] = as.POSIXct(mydata[, icol],format = "%m/%d/%Y") ### CREATE NEW columns with POSIXct
# mydata[, newcol] = as.POSIXct(as.numeric(mydata[, icol]) * (60*60*24), origin="1899-12-30") ### microsoft times
}
- create dayofaddition, daysonhops, hops_g_100mL, pounds_bbl variables with dplyr “mutate”
mydata<- mydata %>%
mutate(dayofaddition = as.integer(as.numeric(difftime(dryhop_date.POSIXct,brew_date.POSIXct))),
daysonhops = as.integer(as.numeric(difftime(Test_Date.POSIXct,dryhop_date.POSIXct))/(60*60*24)),
hops_g_100mL = (mg_hops/volume_mL)/10,
pounds_bbl = (mg_hops/volume_mL)*117/454
)
- ADVANCED data wrangling: manually unpack overloaded columns with grepl
mydata$OX<-grepl("OX", mydata$Hop_type) ## create logical "OX" column
mydata$rouse<-grepl("rouse", mydata$special_group)## create logical column
mydata$Grind<-grepl("Grind", mydata$Hop_type) ## create logical column
mydata$Cone<-grepl("Cone", mydata$Hop_type) ## create logical column
mydata$harvest2014<-grepl("14", mydata$Hop_type) ## create logical column
mydata$harvest2015<-grepl("15", mydata$Hop_type) ## create logical column
mydata$harvest2017<-grepl("17", mydata$Hop_type) ## create logical column
- ADVANCED data wrangling: ifelse statements for harvestyear, form_of_hops, and DH_temp
## ifelse statement for harvestyear (if neither 2014 nor 2015 nor 2017, then 2016)
mydata$harvestYear <- ifelse(
mydata$harvest2014==TRUE, 2014,
ifelse(mydata$harvest2015==TRUE, 2015,
ifelse(mydata$harvest2017==TRUE, 2017, 2016)))
dput(levels(as.factor(mydata$harvestYear)))
c("2014", "2015", "2016", "2017")
## ifelse statement for form of hops (if neither cone nor ground nor NH, then pellet)
mydata$form_of_hops <- ifelse(
mydata$Cone==TRUE, "cone",
ifelse(mydata$Grind==TRUE, "ground",
ifelse(mydata$Hop_type=="NH", "NH", "pellet")))
dput(levels(as.factor(mydata$form_of_hops)))
c("cone", "ground", "NH", "pellet")
## ifelse statement for temperature greater or less than 10
mydata$DH_temp <- ifelse(
mydata$temp.C<10, "cold",
ifelse(mydata$temp.C>10, "warm", "something else"))
dput(levels(as.factor(mydata$DH_temp)))
c("cold", "warm")
- ADVANCED data wrangling: create “variety” column starting with overloaded “Hop_type” column, then stripping away all the non-variety information
mydata$variety<-mydata$Hop_type
mydata$variety<- gsub("[0-9]+","", mydata$variety) ## remove all numbers
mydata$variety<- gsub("OX","", mydata$variety) ## remove specific text
mydata$variety<- gsub("Grind","", mydata$variety)
mydata$variety<- gsub("Cone","", mydata$variety)
mydata$variety<- gsub(" ","", mydata$variety) ## remove spaces
dput(levels(as.factor(mydata$variety)))
c("AMAR", "CASC", "CENT", "CIT", "NH", "SIM")
- ADVANCED data wrangling: create “EXPTnew” (each unique experimental group designated in a single variable)
mydata<- mydata %>% mutate(EXPTnew=paste0("group",expt, substr(special_group, 1,2),as.character(rouse)))
# clean it up by removing "NA" and any spaces due to canarycode bug
mydata$EXPTnew <- gsub(" ","", mydata$EXPTnew) ## remove any spaces
mydata$EXPTnew <- gsub("NA","", mydata$EXPTnew) ## remove "NA"
dput(levels(as.factor(mydata$expt)))
c(" 1A", " 1B", " 2A", " 3A", " 3B", " 4B")
dput(levels(as.factor(mydata$EXPTnew)))
c("group1AFALSE", "group1BFALSE", "group2A01FALSE", "group2A01TRUE",
"group2A05FALSE", "group2A05TRUE", "group2A12FALSE", "group2A12TRUE",
"group2A19FALSE", "group2A19TRUE", "group2A25FALSE", "group2A25TRUE",
"group2A33FALSE", "group2A33TRUE", "group2A42FALSE", "group2A42TRUE",
"group3AFALSE", "group3BFALSE", "group4BFALSE")
select and rearrange columns
(experimental factors on the left, then measurements, followed by calculations and then all date columns on the right):
mydata <- mydata %>%
dplyr::select(sample_id,expt, EXPTnew, hop, BINhop, variety, OX, harvestYear, form_of_hops,special_conditions, rouse, daysonhops, dayofaddition, DH_temp, temp.C, hops_g_100mL, pounds_bbl,
ABV, ABW, OE, Er, Ea, SG, RDF, ADF, Calories,
dhop_day, contact_days, REF_NH, ABV_increase,
brew_date.POSIXct, sample_collection_date.POSIXct, dryhop_date.POSIXct,Test_Date.POSIXct, initial_plato)
## remove ".POSIXct" suffix. Leaving it as-is will only add to confusion if/when these data are saved and re-imported (and become 'character' format again!)
colnames(mydata) = gsub(".POSIXct", "", colnames(mydata))
compute mean NH (control) values
(NH = not dry-hopped)
## mean NH (control) values for each EXPTnew group
mean.control_NH<- mydata %>%
group_by(EXPTnew) %>%
filter(hop=="NH") %>%
summarise_at(vars(ABV, ABW, Ea, SG),funs(mean, n()))
## replace "_mean" with ".control_NH" in column names
colnames(mean.control_NH) = gsub("_mean", ".control_NH", colnames(mean.control_NH))
compute \(\Delta\) values (differences relative to NH controls) using objects created above…
- difference of each ABV,ABW,Ea, and SG data point from corresponding mean.control_NH values (unhopped samples in same EXPTnew group)
## first join our data with mean ABV for unhopped samples in given experiment (mean.control_NH; calculated above)
FPHcalc<- left_join(mydata, mean.control_NH, by="EXPTnew")
## calculate ABW_increase by subtracting each individual ABW measurement from respective mean.control_NH:
FPHcalc$delta.ABV <- FPHcalc$ABV - FPHcalc$ABV.control_NH
FPHcalc$delta.ABW <- FPHcalc$ABW - FPHcalc$ABW.control_NH
FPHcalc$delta.plato <- FPHcalc$Ea - FPHcalc$Ea.control_NH
FPHcalc$delta.SG <- FPHcalc$SG - FPHcalc$SG.control_NH
## the control samples have served their purpose, now remove them from dataset. The following calculations are only meaningful for dry-hopped samples.
FPHcalc<- FPHcalc %>% filter(hop=="DH")
calculate corresponding CO2 production
- following Bamforth (describing Balling equation) “…more realistically, the ethanol yield is more like 0.46 g and carbon dioxide 0.44 g from 1 g sugar” (p. 137 in Brewing Materials and Processes: A Practical Approach to Beer Excellence, Edited by Charles Bamforth Academic Press, 2016)
FPHcalc$calcCO2_increase <- FPHcalc$delta.ABW*(0.44/0.46)
##convert calcCO2_increase (in g/100mL) to calculated CO2 volumes added
## g/L = 10* g/100mL
## The conversion factor from volumes of CO2 to CO2 by weight (g/L) is 1.96. For example: 2.5 volumes x 1.96 = 4.9 g/l.
FPHcalc$calcCO2vols_increase <- FPHcalc$calcCO2_increase*10/1.96
define “FPH” as amount produced per amount dry-hops added (all in g/100mL):
## FPH = Fold Production due to Hops (fold-increase by mass: amount of given endpoint relative to amount of hops added)
##
FPHcalc$FPH_EtOH = FPHcalc$delta.ABW/FPHcalc$hops_g_100mL
FPHcalc$FPH_CO2 = FPHcalc$calcCO2_increase/FPHcalc$hops_g_100mL
FPHcalc$FPH_plato = FPHcalc$delta.plato/FPHcalc$hops_g_100mL
## replacing time-zero time values with a very small number (rather than exactly zero) will prevent issues with analysis of nonlinear models
FPHcalc[FPHcalc$daysonhops==0,]$daysonhops <- 0.01
## and save the transformed data to csv:
write.csv(FPHcalc,"FPHcalc.csv", row.names = FALSE)
FPHcalc <- read.csv("FPHcalc.csv", stringsAsFactors = TRUE)
df<- FPHcalc %>% group_by(EXPTnew) %>%
summarise_at(vars(hops_g_100mL,pounds_bbl,delta.ABV, delta.ABW, delta.plato, FPH_plato, FPH_EtOH, FPH_CO2, calcCO2vols_increase),funs(round(mean(.), 2))) %>%
arrange(desc(FPH_EtOH))
df
visualize
vector~vector*vector plots
df <-FPHcalc
p1<- ggplot(df, aes(y=FPH_EtOH,x=OE, color=contact_days)) + geom_point(size=2)
p2<- ggplot(df, aes(y=FPH_EtOH,x=ADF, color=contact_days)) + geom_point(size=2)
p3<- ggplot(df, aes(y=FPH_EtOH,x=ABW, color=contact_days)) + geom_point(size=2)
p4<- ggplot(df, aes(y=FPH_EtOH,x=Ea, color=contact_days)) + geom_point(size=2)
grid.arrange(p1, p2, p3, p4, ncol = 2)

x~y*(4 vectors) by color
df <- FPHcalc
x<- df$Ea
y<- df$ABW #FPH_EtOH
p1<- ggplot(df, aes(x,y, color=form_of_hops)) + geom_point(size=2)
p2<- ggplot(df, aes(x,y, color=brew_date)) + geom_point(size=2)
p3<- ggplot(df, aes(x,y, color=rouse)) + geom_point(size=2)
p4<- ggplot(df, aes(x,y, color=pounds_bbl)) + geom_point(size=2)
grid.arrange(p1, p2, p3, p4, ncol = 2)

3D plots for html with library(plotly)
see https://plotly-r.com/the-plotly-cookbook.html for overview and some visualization inspiration.
# interactive 3D plots with library(plotly)
df <- FPHcalc # %>% filter(expt==" 2A")
#plot_ly(df, x = ~daysonhops, y = ~ABW, z = ~Ea) %>% add_markers(color = ~expt)
Amunategui “Using Correlations To Understand Your Data”
mydata<-FPHcalc %>% dplyr::select(expt,variety,form_of_hops,pounds_bbl,special_conditions,rouse,daysonhops,DH_temp,brew_date,ABW,SG,OE,ADF,RDF,calcCO2vols_increase,FPH_CO2, FPH_EtOH) ## last one is 100% in j
## note use of "dplyr::select" because one of these packages is conflicting with dplyr commands :()
## following Manuel Amunategui https://www.youtube.com/watch?v=igPQ-pI8Bjo
## Using Correlations To Understand Your Data: Machine Learning With R
##functions for flattenSquareMatrix
cor.prob <- function (X, dfr=nrow(X) -2) {
R<- cor(X, use="pairwise.complete.obs")
above<- row(R) < col(R)
r2 <- R[above]^2
Fstat<- r2 * dfr/(1-r2)
R[above] <- 1- pf(Fstat, 1, dfr)
R[row(R) == col(R)] <- NA
R
}
flattenSquareMatrix <- function(m) {
if( (class(m) != "matrix") | (nrow(m)!=ncol(m))) stop("Must be a square matrix.")
if(!identical(rownames(m), colnames(m))) stop("Row and column names must be equal.")
ut <- upper.tri(m)
data.frame(i = rownames(m)[row(m)[ut]],
j = rownames(m)[col(m)[ut]],
cor=t(m)[ut],
p=m[ut])
}
## library(caret) to dummify everything (turn all characters&factors into columns; ignores numbers and integers)
dmy<- dummyVars(" ~ .",data = mydata)
mydummifieddata<- data.frame(predict(dmy, newdata = mydata))
corMat = cor(mydummifieddata)
corMasterList<- flattenSquareMatrix(cor.prob(mydummifieddata)) ## list of all correlations
## order by strength of correlation
corlist<- corMasterList %>% arrange(-abs(corMasterList$cor))
write.csv(corlist,paste0("FLAT correlation matrix_.csv"))
corlist <- corlist %>% dplyr::filter(j=="FPH_EtOH") ## filter specific endpoint
head(corlist,50)
pairs.panels correlation matrix from library(psych)
## specify interesting variables:
interestingvariables<-c("ABW", "pounds_bbl", "daysonhops", "calcCO2vols_increase")
pairs.panels(mydummifieddata[c(interestingvariables, "FPH_EtOH")])

model
Observing the impacts of dryhopping in the presence of live yeast has led many brewing professionals to understand that FPH is a function of many of the variables above including hop variety,form_of_hops,harvestYear,OX,DH_temp,daysonhops,rouse,pounds_bbl…. Many have intuitively created a model in their heads (without necessarily thinking of it as such) and skillfully adjust process when necessary to account for this phenomenon. In linear modeling, our function will take on the form: \(FPH = intercept + \beta_{1}X_{1} + \beta_{2}X_{2} + ... + \beta_{n}X_{n}\) where \(\beta\) values are what we’re attempting to derive in this modeling exercise, and X values are (collectively) a particular set of conditions.
ActionItem: add link to modeling overview
linear models 1 (create models)
df<-FPHcalc
lm1<-lm(df$SG~df$OE)
lm2<-lm(df$SG~df$ADF)
lm3<-lm(df$SG~df$ABW)
lm4<-lm(df$SG~df$Ea)
par(mfrow = c(2, 2), oma = c(0, 0, 0, 0))
plot(df$SG~df$OE)
abline(lm1)
plot(df$SG~df$ADF)
abline(lm2)
plot(df$SG~df$ABW)
abline(lm3)
plot(df$SG~df$Ea)
abline(lm4)

linear models 2 (view residual plots)
df<-FPHcalc
lm1<-lm(df$SG~df$OE)
lm2<-lm(df$SG~df$ADF)
lm3<-lm(df$SG~df$ABW)
lm4<-lm(df$SG~df$Ea)
par(mfrow = c(2, 2), oma = c(0, 0, 0, 0))
plot(lm1$residuals)
plot(lm2$residuals)
plot(lm3$residuals)
plot(lm4$residuals)

#summary(lm4)
compare linear models using memisc package
df<-FPHcalc
lm1<-lm(df$SG~df$OE)
lm2<-lm(df$SG~df$ADF)
lm3<-lm(df$SG~df$ABW)
lm4<-lm(df$SG~df$Ea)
mtable1234 <- mtable("Model 1"=lm1,"Model 2"=lm2,"Model 3"=lm3, "Model 4"=lm4,
summary.stats=c("sigma","R-squared","F","p","N"),show.eqnames=T)
mtable1234b <- relabel(mtable1234,
"(Intercept)" = "Constant",
x1 = "OE = Original Extract (g/100mL)",
x2 = "ADF = Apparent Degree of Fermentation (%)",
x3 = "ABW = Ethanol (w/w)",
x4 = "Er = Residual Extract (g/100mL)"
)
mtable1234
Calls:
Model 1: lm(formula = df$SG ~ df$OE)
Model 2: lm(formula = df$SG ~ df$ADF)
Model 3: lm(formula = df$SG ~ df$ABW)
Model 4: lm(formula = df$SG ~ df$Ea)
======================================================================
Model 1 Model 2 Model 3 Model 4
----------- ------------- ------------ ---------------
df$SG df$SG df$SG df$SG
----------------------------------------------------------------------
(Intercept) 1.071*** 1.063*** 1.056*** 1.000***
(0.020) (0.000) (0.001) (0.000)
df$OE -0.004**
(0.001)
df$ADF -0.001***
(0.000)
df$ABW -0.008***
(0.000)
df$Ea 0.004***
(0.000)
----------------------------------------------------------------------
sigma 0.002 0.000 0.001 0.000
R-squared 0.066 0.997 0.934 1.000
F 8.553 48038.147 1700.301 3532408.297
p 0.004 0.000 0.000 0.000
N 123 123 123 123
======================================================================
#show_html(mtable1234b)
linear models of FPH
df <- FPHcalc
lm1<-lm(FPH_EtOH~daysonhops, data=df)
lm2<-lm(FPH_EtOH~daysonhops*form_of_hops, data=df)
lm3<-lm(FPH_EtOH~daysonhops*rouse, data=df)
lm4<-lm(FPH_EtOH~daysonhops*form_of_hops*rouse, data=df)
mtable1234 <- mtable("Model 1"=lm1,"Model 2"=lm2,"Model 3"=lm3, "Model 4"=lm4,
summary.stats=c("sigma","R-squared","F","p","N"),show.eqnames=T)
mtable1234b <- relabel(mtable1234,
"(Intercept)" = "Constant",
SG = "Specific Gravity",
ABW = "ABW = Ethanol (w/w)",
Er = "Er = Residual Extract (g/100mL)"
)
mtable1234
Calls:
Model 1: lm(formula = FPH_EtOH ~ daysonhops, data = df)
Model 2: lm(formula = FPH_EtOH ~ daysonhops * form_of_hops, data = df)
Model 3: lm(formula = FPH_EtOH ~ daysonhops * rouse, data = df)
Model 4: lm(formula = FPH_EtOH ~ daysonhops * form_of_hops * rouse, data = df)
=============================================================================
Model 1 Model 2 Model 3 Model 4
----------- ----------- ----------- -----------
FPH_EtOH FPH_EtOH FPH_EtOH FPH_EtOH
-----------------------------------------------------------------------------
(Intercept) 0.498*** 0.306 0.537*** 0.319
(0.044) (0.197) (0.049) (0.197)
daysonhops 0.044*** 0.044*** 0.041*** 0.041***
(0.003) (0.003) (0.004) (0.004)
form_of_hops: ground/cone 0.849** 0.849**
(0.278) (0.277)
form_of_hops: pellet/cone 0.174 0.199
(0.200) (0.199)
rouse -0.227 -0.208
(0.122) (0.118)
daysonhops x rouseTRUE 0.010 0.009
(0.006) (0.006)
-----------------------------------------------------------------------------
sigma 0.355 0.340 0.352 0.339
R-squared 0.634 0.669 0.645 0.678
F 210.023 80.145 72.038 49.193
p 0.000 0.000 0.000 0.000
N 123 123 123 123
=============================================================================
#show_html(mtable1234b)
The R Book (Crawley) Table 20.1: nonlinear functions useful in biology
Table 20.1. Useful non-linear functions EXPANDED:
Asymptotic functions |
Michaelis–Menten |
\(y =\frac{ax}{1+bx}\) |
nls(bone~aage/(1+bage),start=list(a=8,b=0.08))) |
enzyme reactions |
|
|
|
nls(rate~SSmicmen(conc,a,b)) |
tbd |
|
2-parameter asymptotic exponential |
\(y = a(1 − e^{−bx} )\) |
nls(bone~a(1-exp(-cage)),start=list(a=120,c=0.064)) |
tbd |
|
3-parameter asymptotic exponential |
\(y = a − be^{−cx}\) |
nls(bone~a-bexp(-cage),start=list(a=120,b=110,c=0.064)) |
tbd |
|
|
|
nls(bone~SSasymp(age,a,b,c)) |
tbd |
|
|
|
nls(density ~ SSlogis(log(concentration), a, b, c)) |
tbd |
S-shaped functions |
2-parameter logistic |
\(y = \frac{e^{a+bx}}{1 + e^{a+bx}}\) |
|
tbd |
|
3-parameter logistic |
\(y = \frac{a}{1 + be^{−cx}}\) |
|
tbd |
|
4-parameter logistic |
\(y = a + \frac{b-a}{1 + e^{(c−x)/d}}\) |
nls(weight~SSfpl(Time, a, b, c, d)) |
tbd |
|
Weibull |
\(y = a − be^{−(cx^d)}\) |
nls(weight ~ SSweibull(time, Asym, Drop, lrc, pwr)) |
tbd |
|
Gompertz |
\(y = ae^{−be^{−cx}}\) |
|
tbd |
Humped curves |
Ricker curve |
\(y = axe^{−bx}\) |
|
tbd |
|
First-order compartment |
\(y = k exp(−exp(a)x) − exp(−exp(b)x)\) |
nls(conc~SSfol(Dose, Time, a, b, c)) |
tbd |
|
Bell-shaped |
\(y = a exp(−ABS(bx)^2)\) |
|
tbd |
|
Biexponential |
\(y = ae^{bx} − ce^{−dx}\) |
|
tbd |
base R nls function for Michaelis-Menton model
\(y =\frac{ax}{1+bx}\)
expt2 <- FPHcalc %>% dplyr::filter(expt==" 2A") %>% dplyr::select(FPH_EtOH, rouse,special_conditions, daysonhops,brew_date, pounds_bbl,form_of_hops,ABW.control_NH,dayofaddition)
myfactor<- as.factor(expt2$special_conditions) #rouse
cols <- as.numeric(myfactor)
legend.cols <- as.numeric(as.factor(levels(myfactor)))
my.MM.model <- nls(FPH_EtOH~a*daysonhops/(1+b*daysonhops),myfactor, data=expt2,start=list(a=8, b=0.08))
summary(my.MM.model)
Formula: FPH_EtOH ~ a * daysonhops/(1 + b * daysonhops)
Parameters:
Estimate Std. Error t value Pr(>|t|)
a 0.186159 0.010109 18.41 < 2e-16 ***
b 0.067820 0.005728 11.84 1.45e-15 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.09307 on 46 degrees of freedom
Number of iterations to convergence: 6
Achieved convergence tolerance: 1.686e-06
So using the Michaelis-Menten equation as our model, the relationship between hop-induced ethanol production (FPH_EtOH) and hop contact time (daysonhops) would be expressed as: \(FPH_EtOH = \frac{0.183572*daysonhops}{1+0.066388*daysonhops}\) with a Residual standard error of 0.09307. As detailed below, logistic models do a better job than this, and using library(drc) makes it easier to introduce a multilevel factor.
predict and plot model
x <- seq(0,50,0.1)
yv <- predict(my.MM.model, list(daysonhops=x))
{plot(expt2$daysonhops, expt2$FPH_EtOH, pch=21, col="purple", bg="green", main = "Michaelis-Menten model of ethanol production induced by dry-hopping")
lines(x,yv,col="blue")}

Speers2003: Non‐Linear Modelling of Industrial Brewing Fermentations
R. Alex Speers, Peter Rogers, Bruce Smith J. Inst. Brew. 109(3), 229–235, 2003 https://doi.org/10.1002/j.2050-0416.2003.tb00163.x Free Access from the Institute of Brewing! https://onlinelibrary.wiley.com/doi/10.1002/j.2050-0416.2003.tb00163.x
Following Speers2003, the relationship between plato and time in primary fermentations is well modeled by a four parameter logistic model:
PLATO = PINF + (P_D / (1+ (EXP(-B*(TIME-M)))))
or
\(PLATO = P_{inf}+ \frac{P_D}{1+e^{-B(t-M)}}\)
where
- t = time into fermentation
- P_D = change in Plato during fermentation (OE - P_{inf})
- B ~ maximum fermentation rate
- M = fermenation midpoint
- P_inf = final gravity
so then for our case
\(PLATO = P_{inf}+ \frac{OE-P_{inf}}{1+e^{-F_{max}(daysonhops-M)}}\)
PLATO = PINF + (P0 / (1+ (EXP(B*(HOURS-M)))))
MacIntosh2016: An Examination of Substrate and Product Kinetics During Brewing Fermentations
Andrew J. MacIntosh, Maria Josey, R. Alex Speers J. Am. Soc. Brew. Chem. 74(4), 250-257, 2016
“is only statistically advantageous to apply the five-parameter model when many data points are available… (72 from this experiment as opposed to 10 when using Yeast-14).”
MacIntosh five-parameter logistic: \(P_{(t)} =\frac{P_i - P_e}{(1+s * e^{-B(t-M)})^{1/s}}\)
library(drc) getMeanFunctions()
- drc LL.4: \(f(x) = c + \frac{d-c}{1+\exp(b(\log(x)-\tilde{e}))}\) #4param logistic
- drc LL.5: \(f(x) = c + \frac{d-c}{(1+\exp(b(\log(x)-e)))^f}\) #5param logistic
- drc G.4 \(f(x) = c + (d-c)(\exp(-\exp(b(x-e))))\) #4param Gompertz
- drc W1.4 \(f(x) = c + (d-c) \exp(-\exp(b(\log(x)-\log(e))))\) #4param Weibull1
- drc W2.4 \(f(x) = c + (d-c) (1 - \exp(-\exp(b(\log(x)-\log(e)))))\) #4param Weibull2
explore nonlinear models of hop-induced Plato drop over time with library(drc)


{plot(x , y, col=cols, main="raw data")
legend("topright", legend=levels(group), pch=16, col=legend.cols)}
#plot(m.LL.3, type='all',col=cols, main="LL.3 (lower limit at 0)")
plot(m.LL.4, type='all',col=cols, main="LL.4 four-parameter log-logistic")
plot(m.L.4, type='all',col=cols, main="L.4")

plot(m.LL.5, type='all',col=cols, main="LL.5")

plot(m.LL2.4, type='all',col=cols, main="LL2.4 four-parameter log-logistic")

plot(m.LL2.5, type='all',col=cols, main="Generalised log-logistic")

plot(m.W1.4, type='all',col=cols, main="Weibull1")

plot(m.W2.4, type='all',col=cols, main="Weibull2")

plot(m.BC.5, type='all',col=cols, main="Brain-Cousens (hormesis)")

plot(m.AR.3, type='all',col=cols, main="Shifted asymptotic regression")

#plot(m.MM.2, type='all',col=cols, main="2-parameter Michaelis-Menten")
plot(m.MM.3, type='all',col=cols, main="3-parameter Michaelis-Menten")

multilevel logistic models with library(drc)
df <- FPHcalc %>% filter(expt==" 2A")
df$PLATO.hopdrop <- df$initial_plato + df$delta.plato
x <- df$daysonhops
y <- df$PLATO.hopdrop
myfactor<- as.factor(df$special_conditions) #rouse
cols <- as.numeric(myfactor)
legend.cols <- as.numeric(as.factor(levels(myfactor)))
## create models
#m.LL.4<-drm(y ~x,myfactor, fct = LL.4(names = c("Slope", "Lower", "Upper", "Midpoint or ED50"))) #4-parameter log-logistic (general parameters)
#m.LL.4<-drm(y ~x,myfactor, fct = LL.4(names = c("Slope", "Lower", "Upper", "ED50"))) #4-parameter log-logistic (Dose-Response parameters)
m.LL.4<-drm(y ~x,myfactor, fct = LL.4(names = c("F_max", "P_inf", "OE", "M"))) #4-parameter log-logistic
m.L.4 <-drm(y ~x,myfactor, fct = L.4()) #changing the fct = LL.4() to fct = L.4() allows plotting on a log10 scale
m.LL2.4<-drm(y ~x,myfactor, fct = LL2.4()) #4-parameter log-logistic with log(e) rather than e as a parameter
m.LL2.5<-drm(y ~x,myfactor, fct = LL2.5()) #Generalised log-logistic
m.LL.5<-drm(y ~x,myfactor, fct = LL.5()) #5-parameter logistic
#plot a few side by side
par(mfrow = c(1, 2), oma = c(0, 0, 0, 0))
plot(m.LL.4,log="", broken = TRUE, bcontrol = list(style = "slash"), col = c(2,6,3,23,56), main = "4-parameter logistic")
plot(m.LL2.5,log="", broken = TRUE, bcontrol = list(style = "slash"), col = c(2,6,3,23,56), main = "5-parameter logistic")

plot residuals of nonlinear models generated with library(drc)
par(mfrow = c(3, 2), oma = c(0, 0, 0, 0))
{plot(m.MM.3$predres, main = "Michaelis-Menten")
abline(0,0)}
{plot(m.W2.4$predres, main = "Weibull2")
abline(0,0)}
{plot(m.LL.4$predres, main = "4-parameter logistic")
abline(0,0)}
{plot(m.LL.5$predres, main = "5-parameter logistic")
abline(0,0)}
{plot(m.BC.5$predres, main = "Brain-Cousens (hormesis)")
abline(0,0)}
{plot(m.LL2.5$predres, main = "Generalised log-logistic")
abline(0,0)}

from ??BC.5: The model function for the Brain-Cousens model (Brain and Cousens, 1989) is \(f(x, b,c,d,e,f) = c + \frac{d-c+fx}{1+\exp(b(\log(x)-\log(e)))}\) (doesn’t look any better than LL.4, not sure why I’m even looking…)
#pick one model to be mymodel
mymodel <- m.LL.4
mymodel
A 'drc' model.
Call:
drm(formula = y ~ x, curveid = myfactor, fct = LL.4(names = c("F_max", "P_inf", "OE", "M")))
Coefficients:
F_max:rouse F_max:still P_inf:rouse P_inf:still OE:rouse OE:still
1.392 1.800 1.809 2.149 3.749 3.751
M:rouse M:still
12.274 7.958
confint(mymodel)
2.5 % 97.5 %
F_max:rouse 1.186734 1.597836
F_max:still 1.537194 2.063586
P_inf:rouse 1.653949 1.963275
P_inf:still 2.079132 2.219179
OE:rouse 3.710521 3.787753
OE:still 3.714969 3.786819
M:rouse 10.616221 13.931883
M:still 7.316189 8.600129
summary(mymodel)
Model fitted: Log-logistic (ED50 as parameter) (4 parms)
Parameter estimates:
Estimate Std. Error t-value p-value
F_max:rouse 1.392285 0.101704 13.690 < 2.2e-16 ***
F_max:still 1.800390 0.130226 13.825 < 2.2e-16 ***
P_inf:rouse 1.808612 0.076525 23.634 < 2.2e-16 ***
P_inf:still 2.149156 0.034647 62.030 < 2.2e-16 ***
OE:rouse 3.749137 0.019107 196.219 < 2.2e-16 ***
OE:still 3.750894 0.017775 211.018 < 2.2e-16 ***
M:rouse 12.274052 0.820272 14.963 < 2.2e-16 ***
M:still 7.958159 0.317638 25.054 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error:
0.03875737 (40 degrees of freedom)
Each level (rouse and still) has a unique curve shape thereby resulting in different parameters for a best-fit log-logistic model. Now that we know this, it is practical to address each set of conditions separately. Based on the 4-parameter logistic model LL.4 we have for roused samples:
\(PLATO_{roused} = P_{inf}+ \frac{OE-P_{inf}}{1+e^{-F_{max}(daysonhops-M)}}\)
or
\(PLATO_{roused} = 1.809 + \frac{1.94}{1+e^{-1.3923(daysonhops-12.274)}}\)
and for untouched (still) samples:
\(PLATO_{still} = 2.149 + \frac{1.60}{1+e^{-1.8004(daysonhops-7.958)}}\)
Plotting and analysis options for multilevel models are limited. By filtering the data to focus on one set of conditions at a time we create ‘simple’ y~x logistic models. Many wonderful visualization tools are available for such x~y models. Here’s just a few:
single level logistic model with confidence intervals
(specify eval=false in R notebook codechunk header {r eval=FALSE} to prevent multiple messages regarding “Recycling array of length 1 in array-vector arithmetic is deprecated” from appearing in html)
df <- FPHcalc %>% filter(expt==" 2A" & rouse==TRUE)
df$PLATO.hopdrop <- df$initial_plato + df$delta.plato
x <- as.numeric(df$daysonhops)
y <- df$PLATO.hopdrop
## create model
mymodel<-drm(y ~x, fct = LL.4()) #4-parameter logistic with library(drc)
plot(mymodel, main = "default (logarithmic) x-axis")
plot(mymodel, log="", main = "non-logarithmic x-axis")
# create prediction intervals
newdata = data.frame(y = y, x = x)
conf95 <- as.data.frame(predict(mymodel, newdata,interval = "confidence", level = 0.95))
conf95$x <- x
pred95 <- as.data.frame(predict(mymodel, newdata, interval = "prediction", level = 0.95))
pred95$x <- x
pred999 <- as.data.frame(predict(mymodel, newdata, interval = "prediction", level = 0.999))
pred999$x <- x
mymodel
Note: If you compare the multi-level model parameter estimates for rouse=TRUE with those for unilevel model based on rouse=TRUE subset, the results are similar, but not identical!
plot prediction confidence intervals
# Asymmetric ribbons based on prediction intervals
#newdata <- data.frame(y = c(3.5, 3, 2.9, 2.5,1.8,1.7), x = c(1,5,10,20,30,40))
#newdata <- tidyfermi %>% filter(batch = SELECTBATCH) %>% select(days,plato)
qplot(data=pred95, x=x, y=Prediction, ymin=Lower, ymax=Prediction, geom="ribbon", fill=I("red"), alpha=I(0.2)) +
geom_ribbon(data=pred95, aes(x=x, ymin=Prediction, ymax=Upper), fill=I("blue"), alpha=I(0.2)) +
geom_line(data=pred95, aes(x=x, y=Prediction), color=I("green"), lwd=1)+
geom_point(data=newdata, aes(x=x, y=y, ymin=NULL, ymax=NULL), size=1, col="blue")+
ylab("y")

# Visualise intervals
# based on Maurits Evers (https://stackoverflow.com/questions/49444489/nonlinear-regression-prediction-in-r?rq=1)
data.frame(x=x, y=y) %>% ggplot(aes(x, y)) +
geom_point() +
geom_line(data = pred95, aes(x = x, y = Prediction)) +
geom_ribbon(data = pred999, aes(x = x, ymin = Lower, ymax = Upper),fill=I("pink"),alpha = 0.4)+
geom_ribbon(data = pred95, aes(x = x, ymin = Lower, ymax = Upper),fill=I("green"),alpha = 0.4)+
geom_ribbon(data = conf95, aes(x = x, ymin = Lower, ymax = Upper),fill=I("purple"),alpha = 0.4);

---
title: "R Notebook: Modeling the Freshening Power of the Hop"
output: html_notebook
---

> *Step1:  Open this .RMD file in R Studio and click the "Preview" button above.  A formatted version should pop up in a browser window.  IF not, troubleshoot that!*


# manual outline

1. Housekeeping
    1. *SET WORKING DIRECTORY* 
    1. this **R Markdown (.RMD)** document was created using R Studio
        1. plain text
        1. "Reproducible Research" Movement
    1. ***R*** from <https://www.r-project.org/>
    1. ***R Studio*** from <https://www.rstudio.com/products/rstudio/download/>
    1. Environment tab (Rstudio)
    1. Files/Plots/Packages/Help/Viewer  tab (Rstudio)
    1. Preview/view as HTML  (R Markdown in Rstudio)
    1. Outline tab (R Markdown in Rstudio)
    
* Intro
    + Statement of Problem
    + Overview of Experimental Data
    + Measuring the Freshening Power of the Hop (relative to non-dryhopped controls)
    + *calculated metrics*
        - $\Delta$ABV 
        - $\Delta$ABW
        - $\Delta{calculated}$CO$_2$
        - $\Delta$SG$^{20/20}$
        - fold-increase ethanol from hop addition (in $\frac{pounds}{bbl}$ and wt% or $\frac{grams}{100 mL}$)
        - fold-increase CO$_2$ from hop addition (in g/L and CO$_2$ volumes)
        - $\Delta$ &deg;plato
* Load Libraries
* Data Import, Tidy and Transform
* Data Visualization
    + scatter plots
    + scatter plots with added dimensions
* Modeling overview
    + linear regression
    + residuals
* Modeling FPH


# intro
* this is not a stats course.  use at your own risk.
* [*"Data is not information. Information is not knowledge. And knowledge
is certainly not wisdom."*](http://www.datagovernance.com/quotes/knowledge-quotes/ "Clifford Stoll")
* [Wickham's four pillars of data analysis:](https://r4ds.had.co.nz/ "Grolemund and Wickham's R for Data Science")
    * tidy
    * transform
    * visualize
    * model
    
This is primarily an excercise in data wrangling (tidy+transform) and markdown formatting using R Studio.  In that context, this analysis is a *preliminary* exploration of potential metrics for *Hop Freshening Power* (synonyms: 'dryhop creep', 'ABV creep', 'dry-hop creep') and to model this aspect of the warm-dryhopping process. 

```{r echo=TRUE}
#                       __  .__    .___      
#                     _/  |_|__| __| _/__.__.
#                     \   __\  |/ __ <   |  |
#                      |  | |  / /_/ |\___  |
#                      |__| |__\____ |/ ____|
#                                   \/\/     
#                                    \/
#                                    ||
#                                    ||
#                                    ||
#              _                     /\               
#             | |_ _ _ __ _ _ _  ___/ _|___ _ _ _ __  
#             |  _| '_/ _` | ' \(_-<  _/ _ \ '_| '  \ 
#              \__|_| \__,_|_||_/__/_| \___/_| |_|_|_|
#                        //                      \\
#                       //                        \\
#                      //                          \\
#                     //                            \\
#          _         //   _ _                        \\   _     _ 
#     __ _(_)____  _ __ _| (_)______        _ __  ___  __| |___| |
#     \ V / (_-< || / _` | | |_ / -_)------| '  \/ _ \/ _` / -_) |
#      \_/|_/__/\_,_\__,_|_|_/__\___|------|_|_|_\___/\__,_\___|_|
#
#ascii font/art from <http://www.network-science.de/ascii/>
```


## Statement of Problem
The warm-dryhopping process can profoundly impact the body, alcohol and vicinal diketone content of the beer.  Common metrics and models for this phenomenon have not been established.

## Overview of Experimental Data
Through experience in brewing we know that **endpoints** of dryhopping include:

* flavor impact (always)
* impact on visual/presentation (sometimes)
* *ethanol increase* accompanied by decrease in specific gravity (sometimes)
* CO2 increase (sometimes)
* diacetyl/VDK increase (sometimes)
* impact on yeast quality, and so on...

Experiments were carried out where 

1. multiple replicate 250-mL samples of [nearly] end-fermented beer were collected from fermenters into 12oz amber bottles
1. the samples were randomized into groups of three, then each group was either
    * dry-hopped at ~1.0 pounds/bbl (treatment), or
    * not dry-hopped (control)
1. all were hand-crimped with foil, stored for various times (mostly at room temp, a few in the cold), then tested on an Anton Paar DMA4500/Alcolyzer classic
        
Focusing on *ethanol increase as the endpoint*, we know these to be **relevant factors**:

* variety of hops (five different varieties)
* presence of live yeast (true in all cases for these data)
* temperature during dryhopping (mostly warm, with a few in the cold)
* amount of contact time between hops and beer (up to 7 weeks)

*For more details on this dataset see the [manuscript](https://www.tandfonline.com/doi/full/10.1080/03610470.2018.1469081?scroll=top&needAccess=true "Kirkendall2018").  
* please send comments, complaints and corrections to luke.chadwick@gmail.com!
* Data are from the file "ujbc_a_1469081_sm5496.txt" (supplementary data from Jacob A. Kirkendall, Carter A. Mitchell & Lucas R. Chadwick (2018): The Freshening Power of Centennial Hops, *Journal of the American Society of Brewing Chemists* Volume 76, Issue 3, Pages 178-184 (**2018**) DOI: 10.1080/03610470.2018.1469081* available from <https://www.tandfonline.com/doi/full/10.1080/03610470.2018.1469081?scroll=top&needAccess=true>


# load libraries
```{r package.load, results='hide', echo=TRUE} 
#.rs.restartR()  # restart R 
sessionInfo()

#data wrangling
#install.packages("dplyr")  # dplyr "data plyer"  
library(dplyr)
library(ggplot2)
library(tidyverse)

library(caret) # for dummyVars function
library(psych) # for pairs.panels function
library(memisc)  # compare models

library(drc)  # "drc" = dose response curves (contains a suite of flexible and versatile model fitting and after-fitting functions)
library(plotly)
library(gridExtra)  # for grid.arrange in R markdown
```



> flash forward! scroll to the bottom and run bigchunk_tidytransform


#import data
```{r}
rm(list = ls()) # clear workspace
mydata <-read.csv("ujbc_a_1469081_sm5496.txt",stringsAsFactors = FALSE)  ## SPECIFY filename

dput(names(mydata))
```


# !manual add time zero to "days on hops" experiment!
Anton Paar data were collected at time zero for kinetics experiment (triplicate samples run immediately after sample collection, while the remainder were being dry-hopped), but were inadvertently excluded from the original dataset.  Here they are:
Date	time	sample no.	ABV	ABW	OE (P)	Er	Ea	SG 20/20	RDF	ADF sample_id
07/27/17	3:26 PM	03_	6.68	5.2	15.95	6.08	3.74	1.01462	63.88	76.58	JK1-20-41
07/27/17	3:30 PM	04_	6.68	5.2	15.95	6.08	3.73	1.01461	63.89	76.6	JK1-20-50
07/27/17	3:34 PM	05_	6.68	5.2	15.95	6.08	3.74	1.01462	63.88	76.58	JK1-20-87

```{r}
## fill in MISSING DATA :(  add time zero data for time series analysis)

## nested ifelse to add time zero plato values for each expt:
mydata<- mydata %>% mutate(., initial_plato = ifelse(expt==" 2A", 3.74, ifelse(expt==" 1A", 3.69, ifelse(expt==" 1B", 3.53, ifelse(expt==" 3B", 3.54, ifelse(expt==" 4B", 4.66, NA))))))
                                             
## add complete time-zero anton paar data for time series (expt2A)
## quadruple-up each triplicate time-zero measurement (one copy for each NH/DH & rouse/still combo)
## note:  this practice is of questionable statistical rigor.  In retrospect it would have been better (and a lot easier) to just run 12 samples through Anton on day0!
timezero <- mydata %>% filter(expt==" 2A") %>% arrange(special_group) %>% slice(1:12)
#data.entry(timezero)
timezero$Test_Date <- rep(c("7/27/2017","7/27/2017","7/27/2017"),4)
timezero$Test.time <- rep(c("3:26 PM", "3:30 PM", "3:34 PM"),4)
timezero$ABV <- 6.68
timezero$ABW <- 5.2
timezero$OE <- 15.95
timezero$Er <- 6.08
timezero$Ea <- rep(c(3.74, 3.73, 3.74),4)
timezero$SG <- rep(c(1.01462, 1.01461, 1.01462),4)
timezero$RDF <- rep(c(63.88, 63.89, 63.88),4)
timezero$ADF <- rep(c(76.58, 76.60, 76.58),4)
timezero$Calories <- rep(c(215.37,215.34,215.35),4)
timezero[timezero$hop=="DH",]$contact_days <-0
timezero[,c(28:31)] <- 0
mydata <- rbind(timezero,mydata)
write.csv(mydata, "quickie.csv", row.names = FALSE)
mydata<-read.csv("quickie.csv", stringsAsFactors = FALSE)

###  this is a routine for inspecting data ###
mydatatypes<- as.data.frame(cbind(sapply(mydata, class)))  ## table of datatypes
mydatatypes$headers <- rownames(mydatatypes)    ## convert rownames to values
na_count <-as.data.frame(sapply(mydata, function(y) sum(length(which(is.na(y)))))) ## count NAs by column
mydataOverview<-as.data.frame(cbind(na_count,mydatatypes))  ## table of NA counts and datatypes
names(mydataOverview) <- c("NA_count","Data_Class", "Header")
#sum(is.na(mydata))  ## total count of NA values in entire sheet (231 in this case)
#mydataOverview %>% filter(NA_count>0)    ## breakdown of NA values
```


# tidy & transform 

> note: this is a base R +dplyr data wrangling exercise! in general it's best to use lubridate package for date/timestamps! 

* create a date column.  Here we create new date columns in POSIXct format, based on the particular date format used in the source data (ujbc_a_1469081_sm5496.txt; see ?as.POSIXct).  First an example of how NOT to create new variables in our dataframe:
```{r}
test_df<- mydata # (original dataframe to be used for testing/illustration)

x_brew_date.POSIXct<- as.POSIXct(mydata$brew_date, format = "%m/%d/%Y") # as a standalone vector (USELESS here), or:
test_df$brew_date.POSIXct <-as.POSIXct(mydata$brew_date, format = "%m/%d/%Y")  # as a "new column" in our dataframe
```


* ADVANCED data wrangling: 
    * create multiple date columns using FORLOOP
(here we take advantage of the pattern that of our (character) date/time columns have the string "date" in the headername.  First make character vector of all column names containing string "date", then run FORLOOP over each date column).  This forloop will convert datecols into POSIXct format (R/universal datetime format).  They say it's  **best to avoid forloops** in general, and **in this case you would probably use functions from lubridate package** - and in general be aware there's probably a specific tool for the task at hand)!!
```{r}
datecols<- dput(names(dplyr::select(mydata, matches("date"))))  ## headers containing string "date" => c("brew_date", "sample_collection_date", "dryhop_date", "Test_Date")
## FORLOOP
for (icol in datecols) {
  newcol = paste0(icol,".POSIXct")
 # print(newcol)
  mydata[, newcol] = as.POSIXct(mydata[, icol],format = "%m/%d/%Y") ###  CREATE NEW columns with POSIXct   
#  mydata[, newcol] = as.POSIXct(as.numeric(mydata[, icol])  * (60*60*24), origin="1899-12-30") ###  microsoft times
}
```


* create dayofaddition, daysonhops, hops_g_100mL, pounds_bbl variables with dplyr "mutate"
```{r}
mydata<- mydata %>% 
  mutate(dayofaddition = as.integer(as.numeric(difftime(dryhop_date.POSIXct,brew_date.POSIXct))),
         daysonhops = as.integer(as.numeric(difftime(Test_Date.POSIXct,dryhop_date.POSIXct))/(60*60*24)),
         hops_g_100mL = (mg_hops/volume_mL)/10,
         pounds_bbl = (mg_hops/volume_mL)*117/454
         )
```


* ADVANCED data wrangling: manually unpack overloaded columns with grepl
```{r}
mydata$OX<-grepl("OX", mydata$Hop_type)       ##  create logical "OX" column
mydata$rouse<-grepl("rouse", mydata$special_group)##  create logical column
mydata$Grind<-grepl("Grind", mydata$Hop_type)     ##  create logical column
mydata$Cone<-grepl("Cone", mydata$Hop_type)       ##  create logical column
mydata$harvest2014<-grepl("14", mydata$Hop_type)  ##  create logical column
mydata$harvest2015<-grepl("15", mydata$Hop_type)  ##  create logical column
mydata$harvest2017<-grepl("17", mydata$Hop_type)  ##  create logical column
```

* ADVANCED data wrangling: ifelse statements for harvestyear, form_of_hops, and DH_temp
```{r}
## ifelse statement for harvestyear (if neither 2014 nor 2015 nor 2017, then 2016)
mydata$harvestYear <- ifelse(
  mydata$harvest2014==TRUE, 2014,
  ifelse(mydata$harvest2015==TRUE, 2015, 
         ifelse(mydata$harvest2017==TRUE, 2017, 2016)))
dput(levels(as.factor(mydata$harvestYear)))
## ifelse statement for form of hops (if neither cone nor ground nor NH, then pellet) 
mydata$form_of_hops <- ifelse(
  mydata$Cone==TRUE, "cone",
  ifelse(mydata$Grind==TRUE, "ground",
         ifelse(mydata$Hop_type=="NH", "NH", "pellet")))
dput(levels(as.factor(mydata$form_of_hops)))
## ifelse statement for temperature greater or less than 10 
mydata$DH_temp <- ifelse(
  mydata$temp.C<10, "cold",
  ifelse(mydata$temp.C>10, "warm", "something else"))
dput(levels(as.factor(mydata$DH_temp)))
```

* ADVANCED data wrangling: create "variety" column starting with overloaded "Hop_type" column, then stripping away all the non-variety information
```{r}
mydata$variety<-mydata$Hop_type
mydata$variety<- gsub("[0-9]+","", mydata$variety) ## remove all numbers
mydata$variety<- gsub("OX","", mydata$variety)     ## remove specific text
mydata$variety<- gsub("Grind","", mydata$variety)
mydata$variety<- gsub("Cone","", mydata$variety)
mydata$variety<- gsub(" ","", mydata$variety)      ## remove spaces
dput(levels(as.factor(mydata$variety)))
```

* ADVANCED data wrangling: create "EXPTnew" (each unique experimental group designated in a single variable)
```{r}
mydata<- mydata %>% mutate(EXPTnew=paste0("group",expt, substr(special_group, 1,2),as.character(rouse)))
# clean it up by removing "NA" and any spaces due to canarycode bug
mydata$EXPTnew <- gsub(" ","", mydata$EXPTnew)  ## remove any spaces
mydata$EXPTnew <- gsub("NA","", mydata$EXPTnew) ## remove "NA"
dput(levels(as.factor(mydata$expt)))
dput(levels(as.factor(mydata$EXPTnew)))
```


## select and rearrange columns 
(experimental factors on the left, then measurements, followed by calculations and then all date columns on the right):
```{r}
mydata <- mydata %>% 
  dplyr::select(sample_id,expt, EXPTnew, hop, BINhop, variety, OX, harvestYear, form_of_hops,special_conditions, rouse, daysonhops, dayofaddition, DH_temp, temp.C, hops_g_100mL, pounds_bbl,
         ABV, ABW, OE, Er, Ea, SG, RDF, ADF, Calories, 
         dhop_day, contact_days, REF_NH, ABV_increase, 
         brew_date.POSIXct, sample_collection_date.POSIXct, dryhop_date.POSIXct,Test_Date.POSIXct, initial_plato)
## remove ".POSIXct" suffix.  Leaving it as-is will only add to confusion if/when these data are saved and re-imported (and become 'character' format again!)
colnames(mydata) = gsub(".POSIXct", "", colnames(mydata))
```



##compute mean NH (control) values 
(NH = not dry-hopped) 
```{r}
## mean NH (control) values for each EXPTnew group
mean.control_NH<- mydata %>% 
  group_by(EXPTnew) %>%
  filter(hop=="NH") %>%
  summarise_at(vars(ABV, ABW, Ea, SG),funs(mean, n()))
## replace "_mean" with ".control_NH" in column names
colnames(mean.control_NH) = gsub("_mean", ".control_NH", colnames(mean.control_NH))
```


##compute $\Delta$ values (differences relative to NH controls) using objects created above...
* difference of each ABV,ABW,Ea, and SG data point from corresponding mean.control_NH values (unhopped samples in same EXPTnew group)
```{r}
## first join our data with mean ABV for unhopped samples in given experiment (mean.control_NH; calculated above)

FPHcalc<- left_join(mydata, mean.control_NH, by="EXPTnew")

## calculate ABW_increase by subtracting each individual ABW measurement from respective mean.control_NH:
FPHcalc$delta.ABV <- FPHcalc$ABV - FPHcalc$ABV.control_NH
FPHcalc$delta.ABW <- FPHcalc$ABW - FPHcalc$ABW.control_NH
FPHcalc$delta.plato <- FPHcalc$Ea - FPHcalc$Ea.control_NH
FPHcalc$delta.SG <- FPHcalc$SG - FPHcalc$SG.control_NH

## the control samples have served their purpose, now remove them from dataset. The following calculations are only meaningful for dry-hopped samples.  
FPHcalc<- FPHcalc %>% filter(hop=="DH")
```


## *calculate* corresponding CO2 production 
* following Bamforth (describing Balling equation) "...more realistically, the ethanol yield is more like 0.46 g and carbon dioxide 0.44 g from 1 g sugar"  (p. 137 in Brewing Materials and Processes: A Practical Approach to Beer Excellence, Edited by Charles Bamforth Academic Press, 2016)
```{r}
FPHcalc$calcCO2_increase <- FPHcalc$delta.ABW*(0.44/0.46)
##convert calcCO2_increase (in g/100mL) to calculated CO2 volumes added
## g/L = 10* g/100mL
## The conversion factor from volumes of CO2 to CO2 by weight (g/L) is 1.96. For example: 2.5 volumes x 1.96 = 4.9 g/l.
FPHcalc$calcCO2vols_increase <- FPHcalc$calcCO2_increase*10/1.96
```



## define "FPH" as amount produced per amount dry-hops added (all in g/100mL):
```{r}
##  FPH = Fold Production due to Hops (fold-increase by mass: amount of given endpoint relative to amount of hops added)
##  
FPHcalc$FPH_EtOH = FPHcalc$delta.ABW/FPHcalc$hops_g_100mL
FPHcalc$FPH_CO2 = FPHcalc$calcCO2_increase/FPHcalc$hops_g_100mL
FPHcalc$FPH_plato = FPHcalc$delta.plato/FPHcalc$hops_g_100mL

## replacing time-zero time values with a very small number (rather than exactly zero) will prevent issues with analysis of nonlinear models
FPHcalc[FPHcalc$daysonhops==0,]$daysonhops <- 0.01



## and save the transformed data to csv:
write.csv(FPHcalc,"FPHcalc.csv", row.names = FALSE)
FPHcalc <- read.csv("FPHcalc.csv", stringsAsFactors = TRUE)

df<- FPHcalc %>% group_by(EXPTnew) %>%
  summarise_at(vars(hops_g_100mL,pounds_bbl,delta.ABV, delta.ABW, delta.plato, FPH_plato, FPH_EtOH, FPH_CO2, calcCO2vols_increase),funs(round(mean(.), 2))) %>%
  arrange(desc(FPH_EtOH)) 
df
```






# intermission?






# visualize
## vector~vector*vector plots
```{r regression.2}
df <-FPHcalc
p1<- ggplot(df, aes(y=FPH_EtOH,x=OE, color=contact_days)) + geom_point(size=2)
p2<- ggplot(df, aes(y=FPH_EtOH,x=ADF, color=contact_days)) + geom_point(size=2)
p3<- ggplot(df, aes(y=FPH_EtOH,x=ABW, color=contact_days)) + geom_point(size=2)
p4<- ggplot(df, aes(y=FPH_EtOH,x=Ea, color=contact_days)) + geom_point(size=2)
grid.arrange(p1, p2, p3, p4, ncol = 2)
```



## x~y*(4 vectors) by color
```{r regression.4plot.color}
df <- FPHcalc
x<- df$Ea
y<- df$ABW    #FPH_EtOH

p1<- ggplot(df, aes(x,y, color=form_of_hops)) + geom_point(size=2)
p2<- ggplot(df, aes(x,y, color=brew_date)) + geom_point(size=2)
p3<- ggplot(df, aes(x,y, color=rouse)) + geom_point(size=2)
p4<- ggplot(df, aes(x,y, color=pounds_bbl)) + geom_point(size=2)
grid.arrange(p1, p2, p3, p4, ncol = 2)
```

## 3D plots for html with library(plotly)
see https://plotly-r.com/the-plotly-cookbook.html for overview and some visualization inspiration. 
```{r}
# interactive 3D plots with library(plotly)
df <- FPHcalc # %>% filter(expt==" 2A")

# makes html very slow to render...comment out
#plot_ly(df, x = ~daysonhops, y = ~ABW, z = ~Ea) %>% add_markers(color = ~expt)
```

## Amunategui "Using Correlations To Understand Your Data"
```{r Amunategui}
mydata<-FPHcalc %>% dplyr::select(expt,variety,form_of_hops,pounds_bbl,special_conditions,rouse,daysonhops,DH_temp,brew_date,ABW,SG,OE,ADF,RDF,calcCO2vols_increase,FPH_CO2, FPH_EtOH)  ## last one is 100% in j

## note use of "dplyr::select" because one of these packages is conflicting with dplyr commands :()

## following  Manuel Amunategui  https://www.youtube.com/watch?v=igPQ-pI8Bjo
## Using Correlations To Understand Your Data: Machine Learning With R 
##functions for flattenSquareMatrix
cor.prob <- function (X, dfr=nrow(X) -2) {
  R<- cor(X, use="pairwise.complete.obs")
  above<- row(R) < col(R)
  r2 <- R[above]^2
  Fstat<- r2 * dfr/(1-r2)
  R[above] <- 1- pf(Fstat, 1, dfr)
  R[row(R) == col(R)] <- NA
  R
}
flattenSquareMatrix <- function(m) {
  if( (class(m) != "matrix") | (nrow(m)!=ncol(m))) stop("Must be a square matrix.")
  if(!identical(rownames(m), colnames(m))) stop("Row and column names must be equal.")
  ut <- upper.tri(m)
  data.frame(i = rownames(m)[row(m)[ut]],
             j = rownames(m)[col(m)[ut]],
             cor=t(m)[ut],
             p=m[ut])
}

## library(caret) to dummify everything (turn all characters&factors into columns;  ignores numbers and integers)
dmy<- dummyVars(" ~ .",data = mydata)
mydummifieddata<- data.frame(predict(dmy, newdata = mydata))
corMat = cor(mydummifieddata)
corMasterList<- flattenSquareMatrix(cor.prob(mydummifieddata)) ## list of all correlations
## order by strength of correlation
corlist<- corMasterList %>% arrange(-abs(corMasterList$cor)) 
write.csv(corlist,paste0("FLAT correlation matrix_.csv"))
corlist <- corlist %>% dplyr::filter(j=="FPH_EtOH")   ## filter specific endpoint
head(corlist,50)
```

## pairs.panels correlation matrix from library(psych)
```{r}
## specify interesting variables:  
interestingvariables<-c("ABW", "pounds_bbl", "daysonhops", "calcCO2vols_increase") 
pairs.panels(mydummifieddata[c(interestingvariables, "FPH_EtOH")])
```


#model
Observing the impacts of dryhopping in the presence of live yeast has led many brewing professionals to understand that FPH is a function of many of the variables above including hop variety,form_of_hops,harvestYear,OX,DH_temp,daysonhops,rouse,pounds_bbl.... Many have intuitively created a model in their heads (without necessarily thinking of it as such) and skillfully adjust process when necessary to account for this phenomenon.  In linear modeling, our function will take on the form:
$FPH =  intercept + \beta_{1}X_{1} + \beta_{2}X_{2} + ... + \beta_{n}X_{n}$ where $\beta$ values are what we're attempting to derive in this modeling exercise, and X values are (collectively) a particular set of conditions.

>> ActionItem:  add link to modeling overview


## linear models 1 (create models)
```{r}
df<-FPHcalc

lm1<-lm(df$SG~df$OE)
lm2<-lm(df$SG~df$ADF)
lm3<-lm(df$SG~df$ABW)
lm4<-lm(df$SG~df$Ea)
par(mfrow = c(2, 2), oma = c(0, 0, 0, 0))
plot(df$SG~df$OE)
abline(lm1)
plot(df$SG~df$ADF)
abline(lm2)
plot(df$SG~df$ABW)
abline(lm3)
plot(df$SG~df$Ea)
abline(lm4)
```

## linear models 2 (view residual plots)
```{r}
df<-FPHcalc

lm1<-lm(df$SG~df$OE)
lm2<-lm(df$SG~df$ADF)
lm3<-lm(df$SG~df$ABW)
lm4<-lm(df$SG~df$Ea)

par(mfrow = c(2, 2), oma = c(0, 0, 0, 0))
plot(lm1$residuals)
plot(lm2$residuals)
plot(lm3$residuals)
plot(lm4$residuals)
#summary(lm4)
```


## compare linear models using memisc package
```{r}
df<-FPHcalc

lm1<-lm(df$SG~df$OE)
lm2<-lm(df$SG~df$ADF)
lm3<-lm(df$SG~df$ABW)
lm4<-lm(df$SG~df$Ea)

mtable1234 <- mtable("Model 1"=lm1,"Model 2"=lm2,"Model 3"=lm3, "Model 4"=lm4,
                    summary.stats=c("sigma","R-squared","F","p","N"),show.eqnames=T)

mtable1234b <- relabel(mtable1234,
                      "(Intercept)" = "Constant",
                      x1 = "OE = Original Extract (g/100mL)",
                      x2 = "ADF = Apparent Degree of Fermentation (%)",
                      x3 = "ABW = Ethanol (w/w)",
                      x4 = "Er = Residual Extract (g/100mL)"
                      )
mtable1234
#show_html(mtable1234b)


```




## linear models of FPH
```{r}
df <- FPHcalc
lm1<-lm(FPH_EtOH~daysonhops, data=df)
lm2<-lm(FPH_EtOH~daysonhops*form_of_hops, data=df)
lm3<-lm(FPH_EtOH~daysonhops*rouse, data=df)
lm4<-lm(FPH_EtOH~daysonhops*form_of_hops*rouse, data=df)

mtable1234 <- mtable("Model 1"=lm1,"Model 2"=lm2,"Model 3"=lm3, "Model 4"=lm4,
                    summary.stats=c("sigma","R-squared","F","p","N"),show.eqnames=T)
mtable1234b <- relabel(mtable1234,
                      "(Intercept)" = "Constant",
                      SG = "Specific Gravity",
                      ABW = "ABW = Ethanol (w/w)",
                      Er = "Er = Residual Extract (g/100mL)"
                      )
mtable1234
#show_html(mtable1234b)
```



## The R Book (Crawley) Table 20.1: nonlinear functions useful in biology   
Table 20.1. [Useful non-linear functions](https://www.cs.upc.edu/~robert/teaching/estadistica/TheRBook.pdf "Michael J. Crawley.  The R book.  p. 738") EXPANDED:


| Function Class | name | equation | example code |example applications|
|:----------|:-----:|:-------|:-----|:-----|
| **Asymptotic functions**|Michaelis–Menten|$y =\frac{ax}{1+bx}$|nls(bone~a*age/(1+b*age),start=list(a=8,b=0.08))) | enzyme reactions |
| | | | nls(rate~SSmicmen(conc,a,b)) | tbd|
| |2-parameter asymptotic exponential | $y = a(1 − e^{−bx} )$ |nls(bone~a*(1-exp(-c*age)),start=list(a=120,c=0.064)) | tbd |
| |3-parameter asymptotic exponential | $y = a − be^{−cx}$ | nls(bone~a-b*exp(-c*age),start=list(a=120,b=110,c=0.064)) | tbd |
| | | | nls(bone~SSasymp(age,a,b,c)) | tbd |
| | | | nls(density ~ SSlogis(log(concentration), a, b, c)) | tbd |
|**S-shaped functions** |2-parameter logistic |$y = \frac{e^{a+bx}}{1 + e^{a+bx}}$ | | tbd |
| | 3-parameter logistic | $y = \frac{a}{1 + be^{−cx}}$ | | tbd |
| | 4-parameter logistic | $y = a + \frac{b-a}{1 + e^{(c−x)/d}}$ | nls(weight~SSfpl(Time, a, b, c, d)) | tbd |
| | Weibull | $y = a − be^{−(cx^d)}$ | nls(weight ~ SSweibull(time, Asym, Drop, lrc, pwr)) | tbd |
| | Gompertz | $y = ae^{−be^{−cx}}$ | | tbd |
| **Humped curves** | Ricker curve | $y = axe^{−bx}$ | | tbd |
| | First-order compartment | $y = k exp(−exp(a)x) − exp(−exp(b)x)$ | nls(conc~SSfol(Dose, Time, a, b, c)) | tbd |
| | Bell-shaped | $y = a exp(−ABS(bx)^2)$ | | tbd |
| | Biexponential | $y = ae^{bx} − ce^{−dx}$  | | tbd |



## base R nls function for Michaelis-Menton model
$y =\frac{ax}{1+bx}$

```{r}

expt2 <- FPHcalc %>% dplyr::filter(expt==" 2A") %>% dplyr::select(FPH_EtOH, rouse,special_conditions, daysonhops,brew_date, pounds_bbl,form_of_hops,ABW.control_NH,dayofaddition)

myfactor<- as.factor(expt2$special_conditions)  #rouse
cols <- as.numeric(myfactor)
legend.cols <- as.numeric(as.factor(levels(myfactor)))

my.MM.model <- nls(FPH_EtOH~a*daysonhops/(1+b*daysonhops),myfactor, data=expt2,start=list(a=8, b=0.08))

summary(my.MM.model)

```
So using the Michaelis-Menten equation as our model, the relationship between hop-induced ethanol production (FPH_EtOH) and hop contact time (daysonhops) would be expressed as:
$FPH_EtOH = \frac{0.183572*daysonhops}{1+0.066388*daysonhops}$
with a Residual standard error of 0.09307. As detailed below, logistic models do a better job than this, and using library(drc) makes it easier to introduce a multilevel factor.  

## predict and plot model
```{r}
x <- seq(0,50,0.1)
yv <- predict(my.MM.model, list(daysonhops=x))
{plot(expt2$daysonhops, expt2$FPH_EtOH, pch=21, col="purple", bg="green", main = "Michaelis-Menten model of ethanol production induced by dry-hopping")
  lines(x,yv,col="blue")}

```



##  Speers2003: Non‐Linear Modelling of Industrial Brewing Fermentations
R. Alex Speers, Peter Rogers, Bruce Smith
J. Inst. Brew. 109(3), 229–235, 2003 
<https://doi.org/10.1002/j.2050-0416.2003.tb00163.x>
Free Access from the Institute of Brewing!
<https://onlinelibrary.wiley.com/doi/10.1002/j.2050-0416.2003.tb00163.x>

Following Speers2003, the relationship between plato and time in primary fermentations is well modeled by a four parameter logistic model:

PLATO = PINF + (P_D / (1+ (EXP(-B*(TIME-M)))))

or

$PLATO = P_{inf}+ \frac{P_D}{1+e^{-B(t-M)}}$

where

* t = time into fermentation
* P_D =  change in Plato during fermentation (OE - P_{inf})
* B ~ maximum fermentation rate
* M = fermenation midpoint
* P_inf = final gravity


so then for our case

$PLATO = P_{inf}+  \frac{OE-P_{inf}}{1+e^{-F_{max}(daysonhops-M)}}$

PLATO = PINF + (P0 / (1+ (EXP(B*(HOURS-M)))))


##  MacIntosh2016: An Examination of Substrate and Product Kinetics During Brewing Fermentations
Andrew J. MacIntosh, Maria Josey, R. Alex Speers
[J. Am. Soc. Brew. Chem. 74(4), 250-257, 2016](https://www.asbcnet.org/publications/journal/vol/2016/Pages/ASBCJ-2016-4753-01.aspx)

"is only statistically advantageous to apply the five-parameter model when many data
points are available... (72 from this experiment as opposed to 10 when using Yeast-14)."

MacIntosh five-parameter logistic:  $P_{(t)} =\frac{P_i - P_e}{(1+s * e^{-B(t-M)})^{1/s}}$

library(drc)
getMeanFunctions()

* drc LL.4:  $f(x) = c + \frac{d-c}{1+\exp(b(\log(x)-\tilde{e}))}$  #4param logistic
* drc LL.5:  $f(x) = c + \frac{d-c}{(1+\exp(b(\log(x)-e)))^f}$      #5param logistic
* drc G.4 $f(x) = c + (d-c)(\exp(-\exp(b(x-e))))$                   #4param Gompertz  
* drc W1.4 $f(x) = c + (d-c) \exp(-\exp(b(\log(x)-\log(e))))$       #4param Weibull1
* drc W2.4 $f(x) = c + (d-c) (1 - \exp(-\exp(b(\log(x)-\log(e)))))$ #4param Weibull2


## explore nonlinear models of hop-induced Plato drop over time with library(drc) 
```{r drc.models}
df <- FPHcalc %>% filter(expt==" 2A") 
df$PLATO.hopdrop <- df$initial_plato + df$delta.plato

x <- df$daysonhops
y <- df$PLATO.hopdrop
group<- as.factor(df$special_conditions)   #rouse
cols <- as.numeric(group)
legend.cols <- as.numeric(as.factor(levels(group)))

## Fitting models using function drm from library(drc). see <http://rstats4ag.org/dose-response-curves.html> for overview and <https://www.rdocumentation.org/packages/drc/> for list of models (fct values) and other functions available in drc

## create models
#m.LL.3<-drm(y ~x,group, fct = LL.3()) #3-parameter logistic (lower limit at 0)
#m.LL.3u<-drm(y ~x, fct = LL.3u()) #3-parameter logistic (upper limit at 1)
m.LL.4<-drm(y ~x,group, fct = LL.4()) #4-parameter log-logistic  
# from ??LL.4:  f(x) = c + \frac{d-c}{1+\exp(b(\log(x)-\log(e)))} or in another parameterisation (converting the term \log(e) into a parameter) f(x) = c + \frac{d-c}{1+\exp(b(\log(x)-\tilde{e}))}  
m.L.4 <-drm(y ~x,group, fct = L.4()) #changing the fct = LL.4() to fct = L.4() allows plotting on a log10 scale
m.LL2.4<-drm(y ~x,group, fct = LL2.4()) #4-parameter log-logistic with log(e) rather than e as a parameter  
m.LL2.5<-drm(y ~x,group, fct = LL2.5()) #Generalised log-logistic
# from ??LL.2.5:   f(x) = c + \frac{d-c}{(1+\exp(b(\log(x)-e)))^f}
m.LL.5<-drm(y ~x,group, fct = LL.5()) #5-parameter logistic  
m.W1.4<-drm(y ~x,group, fct = W1.4()) #4-parameter Weibull1  
m.W2.4<-drm(y ~x,group, fct = W2.4()) #4-parameter Weibull2  
m.BC.5<-drm(y ~x,group, fct = BC.5()) #5-parameter Brain-Cousens (hormesis)
m.AR.3<-drm(y ~x,group, fct = AR.3()) #3-parameter Shifted asymptotic regression
#m.MM.2<-drm(y ~x,group, fct = MM.2()) #2-parameter Michaelis-Menten
m.MM.3<-drm(y ~x,group, fct = MM.3()) #3-parameter Michaelis-Menten


##plot
#par(mfrow = c(1,2), oma = c(0, 0, 0, 0))

{plot(x , y, col=cols, main="raw data")
legend("topright", legend=levels(group), pch=16, col=legend.cols)}

#plot(m.LL.3, type='all',col=cols, main="LL.3 (lower limit at 0)")
plot(m.LL.4, type='all',col=cols, main="LL.4 four-parameter log-logistic")
plot(m.L.4, type='all',col=cols, main="L.4")
plot(m.LL.5, type='all',col=cols, main="LL.5")
plot(m.LL2.4, type='all',col=cols, main="LL2.4 four-parameter log-logistic")
plot(m.LL2.5, type='all',col=cols, main="Generalised log-logistic")
plot(m.W1.4, type='all',col=cols, main="Weibull1")
plot(m.W2.4, type='all',col=cols, main="Weibull2")
plot(m.BC.5, type='all',col=cols, main="Brain-Cousens (hormesis)")
plot(m.AR.3, type='all',col=cols, main="Shifted asymptotic regression")
#plot(m.MM.2, type='all',col=cols, main="2-parameter Michaelis-Menten")
plot(m.MM.3, type='all',col=cols, main="3-parameter Michaelis-Menten")

```

## multilevel logistic models with library(drc)
```{r drc.models.levels}
df <- FPHcalc %>% filter(expt==" 2A") 
df$PLATO.hopdrop <- df$initial_plato + df$delta.plato
x <- df$daysonhops
y <- df$PLATO.hopdrop
myfactor<- as.factor(df$special_conditions)  #rouse
cols <- as.numeric(myfactor)
legend.cols <- as.numeric(as.factor(levels(myfactor)))

## create models
#m.LL.4<-drm(y ~x,myfactor, fct = LL.4(names = c("Slope", "Lower", "Upper", "Midpoint or ED50"))) #4-parameter log-logistic (general parameters)
#m.LL.4<-drm(y ~x,myfactor, fct = LL.4(names = c("Slope", "Lower", "Upper", "ED50"))) #4-parameter log-logistic (Dose-Response parameters)
m.LL.4<-drm(y ~x,myfactor, fct = LL.4(names = c("F_max", "P_inf", "OE", "M"))) #4-parameter log-logistic  
m.L.4 <-drm(y ~x,myfactor, fct = L.4()) #changing the fct = LL.4() to fct = L.4() allows plotting on a log10 scale
m.LL2.4<-drm(y ~x,myfactor, fct = LL2.4()) #4-parameter log-logistic with log(e) rather than e as a parameter  
m.LL2.5<-drm(y ~x,myfactor, fct = LL2.5()) #Generalised log-logistic
m.LL.5<-drm(y ~x,myfactor, fct = LL.5()) #5-parameter logistic  

#plot a few side by side
par(mfrow = c(1, 2), oma = c(0, 0, 0, 0))
plot(m.LL.4,log="",  broken = TRUE, bcontrol = list(style = "slash"), col = c(2,6,3,23,56), main = "4-parameter logistic")
plot(m.LL2.5,log="",  broken = TRUE, bcontrol = list(style = "slash"), col = c(2,6,3,23,56), main = "5-parameter logistic")

```


## plot residuals of nonlinear models generated with library(drc)
```{r}
par(mfrow = c(3, 2), oma = c(0, 0, 0, 0))

{plot(m.MM.3$predres, main = "Michaelis-Menten")  
  abline(0,0)}
{plot(m.W2.4$predres, main = "Weibull2") 
  abline(0,0)}
{plot(m.LL.4$predres, main = "4-parameter logistic") 
  abline(0,0)}
{plot(m.LL.5$predres, main = "5-parameter logistic") 
  abline(0,0)}
{plot(m.BC.5$predres, main = "Brain-Cousens (hormesis)") 
  abline(0,0)}
{plot(m.LL2.5$predres, main = "Generalised log-logistic") 
  abline(0,0)}
```


from ??BC.5:  The model function for the Brain-Cousens model (Brain and Cousens, 1989) is
$f(x, b,c,d,e,f) = c + \frac{d-c+fx}{1+\exp(b(\log(x)-\log(e)))}$
(doesn't look any better than LL.4, not sure why I'm even looking...)

```{r}
#pick one model to be mymodel
mymodel <- m.LL.4
mymodel
confint(mymodel)
summary(mymodel)

```

Each level (rouse and still) has a unique curve shape thereby resulting in different parameters for a best-fit log-logistic model.  Now that we know this, it is practical to address each set of conditions separately. Based on the 4-parameter logistic model LL.4 we have for roused samples:

$PLATO_{roused} = P_{inf}+  \frac{OE-P_{inf}}{1+e^{-F_{max}(daysonhops-M)}}$

or

$PLATO_{roused}  = 1.809 + \frac{1.94}{1+e^{-1.3923(daysonhops-12.274)}}$

and for untouched (still) samples:

$PLATO_{still}  = 2.149 + \frac{1.60}{1+e^{-1.8004(daysonhops-7.958)}}$

Plotting and analysis options for multilevel models are limited.  By filtering the data to focus on one set of conditions at a time we create 'simple' y~x logistic models. Many wonderful visualization tools are available for such x~y models.  Here's just a few:

## single level logistic model with confidence intervals 
(specify eval=false in R  notebook codechunk header {r eval=FALSE} to prevent multiple messages regarding "Recycling array of length 1 in array-vector arithmetic is deprecated" from appearing in html)
```{r eval=FALSE}
df <- FPHcalc %>% filter(expt==" 2A" & rouse==TRUE) 
df$PLATO.hopdrop <- df$initial_plato + df$delta.plato
x <- as.numeric(df$daysonhops)
y <- df$PLATO.hopdrop

## create model
mymodel<-drm(y ~x, fct = LL.4()) #4-parameter logistic with library(drc)

plot(mymodel, main = "default (logarithmic) x-axis")
plot(mymodel, log="", main = "non-logarithmic x-axis")

# create prediction intervals
newdata = data.frame(y = y, x = x)
conf95 <- as.data.frame(predict(mymodel, newdata,interval = "confidence", level = 0.95))
conf95$x <- x
pred95 <- as.data.frame(predict(mymodel, newdata, interval = "prediction", level = 0.95))
pred95$x <- x
pred999 <- as.data.frame(predict(mymodel, newdata, interval = "prediction", level = 0.999))
pred999$x <- x
mymodel
```
Note:  If you compare the multi-level model parameter estimates for rouse=TRUE with those for unilevel model based on rouse=TRUE subset, the results are similar, but not identical!

## plot prediction confidence intervals
```{r}

# Asymmetric ribbons based on prediction intervals
#newdata <- data.frame(y = c(3.5, 3, 2.9, 2.5,1.8,1.7), x = c(1,5,10,20,30,40))
#newdata <- tidyfermi %>% filter(batch = SELECTBATCH) %>% select(days,plato) 
qplot(data=pred95, x=x, y=Prediction, ymin=Lower, ymax=Prediction, geom="ribbon", fill=I("red"), alpha=I(0.2)) +
geom_ribbon(data=pred95, aes(x=x, ymin=Prediction, ymax=Upper), fill=I("blue"), alpha=I(0.2)) +
geom_line(data=pred95, aes(x=x, y=Prediction), color=I("green"), lwd=1)+
geom_point(data=newdata, aes(x=x, y=y, ymin=NULL, ymax=NULL), size=1, col="blue")+
 ylab("y")

# Visualise intervals
# based on Maurits Evers (https://stackoverflow.com/questions/49444489/nonlinear-regression-prediction-in-r?rq=1)
data.frame(x=x, y=y) %>% ggplot(aes(x, y)) +
geom_point() +
geom_line(data = pred95, aes(x = x, y = Prediction)) +
geom_ribbon(data = pred999, aes(x = x, ymin = Lower, ymax = Upper),fill=I("pink"),alpha = 0.4)+
geom_ribbon(data = pred95, aes(x = x, ymin = Lower, ymax = Upper),fill=I("green"),alpha = 0.4)+
geom_ribbon(data = conf95, aes(x = x, ymin = Lower, ymax = Upper),fill=I("purple"),alpha = 0.4);

```


# summary

For cases where 2016 Centennial hops are added to this particular base beer (sampled into 12oz bottles) near the end of primary fermentation at room temperature and bottles remain untouched (still), the 4-parameter logistic equation with the following sets of parameters is a reasonable model of the subsequent hop-induced plato drop:
$PLATO_{still} = f(daysonhops)  = 2.149 + \frac{1.60}{1+e^{-1.8004(daysonhops-7.958)}}$

The following 4-parameter logistic equation is a reasonable model for cases identical to the above, but where bottles are roused once daily:
$PLATO_{roused} = f(daysonhops)  = 1.809 + \frac{1.94}{1+e^{-1.3925(daysonhops-12.272)}}$

The residuals for the 5-parameter logistic models are similar to those obtained for the 4-parameter equation, but in the case of unroused samples a visual inspection reveals very different curves.  The 4-parameter logistic predicts a much shorter "lag time" than does the 5-parameter logistic.  More data points in the days after dryhopping would shed light on the question as to which model is superior.

Only a single variable (rousing vs. still) among several hundred variables encompassing raw materials, machines, people and process resulted in quite different estimated parameters for a 4-parameter logistic model.  These results highlight the fact that all process variables known to significantly impact the endpoints we wish to control must themselves be defined, controlled and/or documented if models such as these are to be useful to characterize the process in question.  

# bigchunk_tidytransform

```{r bigchunk_tidytransform, echo=FALSE}
mydata <-read.csv("ujbc_a_1469081_sm5496.txt",stringsAsFactors = FALSE)  ## SPECIFY filename

## fill in MISSING DATA :(  add time zero data for time series analysis)

## nested ifelse to add time zero plato values for each expt:
mydata<- mydata %>% mutate(., initial_plato = ifelse(expt==" 2A", 3.74, ifelse(expt==" 1A", 3.69, ifelse(expt==" 1B", 3.53, ifelse(expt==" 3B", 3.54, ifelse(expt==" 4B", 4.66, NA))))))
                                             
## add complete time-zero anton paar data for time series (expt2A)
## quadruple-up each triplicate time-zero measurement (one copy for each NH/DH & rouse/still combo)
## note:  this practice is of questionable statistical rigor.  In retrospect it would have been better (and a lot easier) to just run 12 samples through Anton on day0!
timezero <- mydata %>% filter(expt==" 2A") %>% arrange(special_group) %>% slice(1:12)
#data.entry(timezero)
timezero$Test_Date <- rep(c("7/27/2017","7/27/2017","7/27/2017"),4)
timezero$Test.time <- rep(c("3:26 PM", "3:30 PM", "3:34 PM"),4)
timezero$ABV <- 6.68
timezero$ABW <- 5.2
timezero$OE <- 15.95
timezero$Er <- 6.08
timezero$Ea <- rep(c(3.74, 3.73, 3.74),4)
timezero$SG <- rep(c(1.01462, 1.01461, 1.01462),4)
timezero$RDF <- rep(c(63.88, 63.89, 63.88),4)
timezero$ADF <- rep(c(76.58, 76.60, 76.58),4)
timezero$Calories <- rep(c(215.37,215.34,215.35),4)
timezero[timezero$hop=="DH",]$contact_days <-0
timezero[,c(28:31)] <- 0
mydata <- rbind(timezero,mydata)
write.csv(mydata, "quickie.csv", row.names = FALSE)
mydata<-read.csv("quickie.csv", stringsAsFactors = FALSE)

###  this is a routine for inspecting data ###
#mydatatypes<- as.data.frame(cbind(sapply(mydata, class)))  ## table of datatypes
#mydatatypes$headers <- rownames(mydatatypes)    ## convert rownames to values
#na_count <-as.data.frame(sapply(mydata, function(y) sum(length(which(is.na(y)))))) ## count NAs by column
#mydataOverview<-as.data.frame(cbind(na_count,mydatatypes))  ## table of NA counts and datatypes
#names(mydataOverview) <- c("NA_count","Data_Class", "Header")
#sum(is.na(mydata))  ## total count of NA values in entire sheet (231 in this case)
#mydataOverview %>% filter(NA_count>0)    ## breakdown of NA values

datecols<- dput(names(dplyr::select(mydata, matches("date"))))  ## headers containing string "date" => c("brew_date", "sample_collection_date", "dryhop_date", "Test_Date")
## FORLOOP
for (icol in datecols) {
  newcol = paste0(icol,".POSIXct")
 # print(newcol)
  mydata[, newcol] = as.POSIXct(mydata[, icol],format = "%m/%d/%Y") ###  CREATE NEW POSIXct columns   
#  mydata[, newcol] = as.POSIXct(as.numeric(mydata[, icol])  * (60*60*24), origin="1899-12-30") ###  microsoft times
}

## create dayofaddition, daysonhops, hops_g_100mL, pounds_bbl variables with dplyr "mutate"
mydata<- mydata %>% 
  mutate(dayofaddition = as.integer(as.numeric(difftime(dryhop_date.POSIXct,brew_date.POSIXct))),
         daysonhops = as.integer(as.numeric(difftime(Test_Date.POSIXct,dryhop_date.POSIXct))/(60*60*24)),
         hops_g_100mL = (mg_hops/volume_mL)/10,
         pounds_bbl = (mg_hops/volume_mL)*117/454
         )
         
mydata$OX<-grepl("OX", mydata$Hop_type)       ##  create logical "OX" column
mydata$rouse<-grepl("rouse", mydata$special_group)##  create logical column
mydata$Grind<-grepl("Grind", mydata$Hop_type)     ##  create logical column
mydata$Cone<-grepl("Cone", mydata$Hop_type)       ##  create logical column
mydata$harvest2014<-grepl("14", mydata$Hop_type)  ##  create logical column
mydata$harvest2015<-grepl("15", mydata$Hop_type)  ##  create logical column
mydata$harvest2017<-grepl("17", mydata$Hop_type)  ##  create logical column

## ifelse statement for harvestyear (if neither 2014 nor 2015 nor 2017, then 2016)
mydata$harvestYear <- ifelse(
  mydata$harvest2014==TRUE, 2014,
  ifelse(mydata$harvest2015==TRUE, 2015, 
         ifelse(mydata$harvest2017==TRUE, 2017, 2016)))

## ifelse statement for form of hops (if neither cone nor ground nor NH, then pellet) 
mydata$form_of_hops <- ifelse(
  mydata$Cone==TRUE, "cone",
  ifelse(mydata$Grind==TRUE, "ground",
         ifelse(mydata$Hop_type=="NH", "NH", "pellet")))

## ifelse statement for temperature greater or less than 10 
mydata$DH_temp <- ifelse(
  mydata$temp.C<10, "cold",
  ifelse(mydata$temp.C>10, "warm", "something else"))

mydata$variety<-mydata$Hop_type
mydata$variety<- gsub("[0-9]+","", mydata$variety) ## remove all numbers
mydata$variety<- gsub("OX","", mydata$variety)     ## remove specific text
mydata$variety<- gsub("Grind","", mydata$variety)
mydata$variety<- gsub("Cone","", mydata$variety)
mydata$variety<- gsub(" ","", mydata$variety)      ## remove spaces

mydata<- mydata %>% mutate(EXPTnew=paste0("group",expt, substr(special_group, 1,2),as.character(rouse)))
# clean it up by removing "NA" and any spaces due to canarycode bug
mydata$EXPTnew <- gsub(" ","", mydata$EXPTnew)  ## remove any spaces
mydata$EXPTnew <- gsub("NA","", mydata$EXPTnew) ## remove "NA"

## select and rearrange columns 
mydata <- mydata %>% 
  dplyr::select(sample_id,expt, EXPTnew, hop, BINhop, variety, OX, harvestYear, form_of_hops,special_conditions, rouse, daysonhops, dayofaddition, DH_temp, temp.C, hops_g_100mL, pounds_bbl,
         ABV, ABW, OE, Er, Ea, SG, RDF, ADF, Calories, 
         dhop_day, contact_days, REF_NH, ABV_increase, 
         brew_date.POSIXct, sample_collection_date.POSIXct, dryhop_date.POSIXct,Test_Date.POSIXct, initial_plato)
## remove ".POSIXct" suffix.  Leaving it as-is will only add to confusion if/when these data are saved and re-imported (and become 'character' format!)
colnames(mydata) = gsub(".POSIXct", "", colnames(mydata))
##compute mean NH (control) values 
## mean NH (control) values for each EXPTnew group
mean.control_NH<- mydata %>% 
  group_by(EXPTnew) %>%
  filter(hop=="NH") %>%
  summarise_at(vars(ABV, ABW, Ea, SG),funs(mean, n()))
## replace "_mean" with ".control_NH" in column names
colnames(mean.control_NH) = gsub("_mean", ".control_NH", colnames(mean.control_NH))
##compute $\Delta$ values (changes relative to un-dryhopped controls) using objects created above...
## first join our data with mean ABV for unhopped samples in given experiment (mean.control_NH; calculated above)

FPHcalc<- left_join(mydata, mean.control_NH, by="EXPTnew")

## calculate ABW_increase by subtracting each individual ABW measurement from respective mean.control_NH:
FPHcalc$delta.ABV <- FPHcalc$ABV - FPHcalc$ABV.control_NH
FPHcalc$delta.ABW <- FPHcalc$ABW - FPHcalc$ABW.control_NH
FPHcalc$delta.plato <- FPHcalc$Ea - FPHcalc$Ea.control_NH
FPHcalc$delta.SG <- FPHcalc$SG - FPHcalc$SG.control_NH

## the control samples have served their purpose, now remove them from dataset. The following calculations are only meaningful for dry-hopped samples.  
FPHcalc<- FPHcalc %>% filter(hop=="DH")
## *calcalate* corresponding CO2 production 
FPHcalc$calcCO2_increase <- FPHcalc$delta.ABW*(0.44/0.46)
##convert calcCO2_increase (in g/100mL) to calculated CO2 volumes added
## g/L = 10* g/100mL
## The conversion factor from volumes of CO2 to CO2 by weight (g/L) is 1.96. For example: 2.5 volumes x 1.96 = 4.9 g/l.
FPHcalc$calcCO2vols_increase <- FPHcalc$calcCO2_increase*10/1.96
## define "FPH" as amount produced per % dry-hops added (in g/100mL):
##  FPH = Fold Production due to Hops (fold-increase by mass: amount of given endpoint relative to amount of hops added)
##  
FPHcalc$FPH_EtOH = FPHcalc$delta.ABW/FPHcalc$hops_g_100mL
FPHcalc$FPH_CO2 = FPHcalc$calcCO2_increase/FPHcalc$hops_g_100mL
FPHcalc$FPH_plato = FPHcalc$delta.plato/FPHcalc$hops_g_100mL

## replacing time-zero time values with a very small number (rather than exactly zero) will prevent issues with analysis of nonlinear models
FPHcalc[FPHcalc$daysonhops==0,]$daysonhops <- 0.01

## and save the transformed data to csv:
write.csv(FPHcalc,"FPHcalc.csv", row.names = FALSE)
FPHcalc <- read.csv("FPHcalc.csv", stringsAsFactors = TRUE)

df<- FPHcalc %>% group_by(EXPTnew) %>%
  summarise_at(vars(hops_g_100mL,pounds_bbl,delta.ABV, delta.ABW, delta.plato, FPH_plato, FPH_EtOH, FPH_CO2, calcCO2vols_increase),funs(round(mean(.), 2))) %>%
  arrange(desc(FPH_EtOH)) 
df
```

#session info
```{r}
sessionInfo()
```

