Production All Data
# Load in the data
data = read.csv('Production/All_Data_Production_Summary.csv')
mcdi = read.csv('Production/production-mcdi.csv')
converge = read.csv('Production/All_Data_Production_Convergence.csv')
converge$converged = "No"
converge$converged[converge$Rhat_k < 1.2 & converge$Rhat_l < 1.2 & converge$Rhat_s < 1.2] = 'Yes'
message(paste('Percent Chains Converged:', 100*dim(converge[converge$converged=='Yes',])[1] / dim(converge)[1]))
## Percent Chains Converged: 100
d = merge(data, converge[c('language','item.id','converged')])
d = subset(d, converged=='Yes')
Check Model Fit
pred = merge(mcdi, d[,c('language', 'item.id', 'k_Mean', 'lambda_Mean', 'start_Mean')])
PRED = NULL
for(q in split(pred, paste(pred$language, pred$item))) {
train <- q
test <- q
train$successes = train$produces * train$n
train$failures = train$n - train$successes
g.Probit <- glm( cbind(successes, failures) ~ age, family=binomial(link="probit"), data=train)
g.Logit <- glm( cbind(successes, failures) ~ age, family=binomial(link="logit"), data=train)
PRED <- rbind(PRED, data.frame(language=q$language[1], item.id=q$item.id[1], item=q$item[1], category=q$category[1],
k=q$k_Mean[1], l=q$lambda_Mean[1], s=q$start_Mean[1], obs=test$produces, n=q$n[1], age=test$age,
Gamma=pgamma(test$age-q$start_Mean[1], shape=q$k_Mean[1], rate=q$lambda_Mean[1]),
Probit=predict(g.Probit, newdata=test, type="response"),
Logit=predict(g.Logit, newdata=test, type="response")
))
}
pred.m <- melt(PRED, id.vars=c('language', 'item.id', 'age', 'obs', 'n', 'category', 'item', 'k', 'l', 's'))
mycor.test <- function(df) {
cr <- cor.test(df$value, df$obs)
data.frame(estimate=cr$estimate, ymin=cr$conf.int[1], ymax=cr$conf.int[2])
}
m <- ddply( pred.m, c("language", "variable"), mycor.test)
m$Model <- m$variable
ggplot(m, aes(y=estimate, x=language, color=Model, ymin=ymin, ymax=ymax)) +
geom_point() + geom_errorbar() +
ylim(0,1) + theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
xlab("") + ylab("Coef. of Determination")

Check Parameter Values
# Remove k_Mean > 3000, which are all syntactic structures not lexical items
d = subset(d, k_Mean < 3000)
mean = ddply(d, .(language), summarise, k_M=mean(k_Mean), k_low=mean(k_HDIlow), k_high=mean(k_HDIhigh),
l_M=mean(lambda_Mean), l_low=mean(lambda_HDIlow), l_high=mean(lambda_HDIhigh),
s_M=mean(start_Mean), s_low=mean(start_HDIlow), s_high=mean(start_HDIhigh))
mean
## language k_M k_low k_high l_M l_low l_high
## 1 Cantonese 13.384701 3.2131192 23.814307 0.6012640 0.25130280 0.9471624
## 2 Croatian 12.089559 1.9733039 22.971015 0.5404920 0.16511126 0.9355322
## 3 Danish 10.700966 5.2738375 16.929480 0.4903437 0.32540963 0.6683131
## 4 English 11.428233 5.4411342 17.699654 0.5317706 0.34118729 0.7198872
## 5 German 14.772612 4.1255255 24.655968 0.6743865 0.32403650 1.0002087
## 6 Hebrew 16.012298 0.7462316 35.734628 0.8398933 0.08735008 1.7611053
## 7 Italian 8.142294 1.6986016 16.241610 0.4102736 0.16649108 0.6749154
## 8 Mandarin 7.758743 1.4633166 17.956013 0.5776843 0.25345863 0.9864020
## 9 Norwegian 5.021387 3.2914092 7.124552 0.3091743 0.24764179 0.3782119
## 10 Russian 5.830934 1.3340862 11.931618 0.2894680 0.12136921 0.4810541
## 11 Spanish 7.135218 2.3896737 11.663059 0.2918068 0.13546774 0.4447380
## 12 Swedish 15.214969 2.9555671 28.549485 0.7045917 0.25139306 1.1653782
## 13 Turkish 8.024480 3.6727073 12.395066 0.3449581 0.21699089 0.4689106
## s_M s_low s_high
## 1 5.894536 0.063793113 13.05651
## 2 5.771750 0.013516278 13.55681
## 3 8.155522 3.120217691 12.83247
## 4 5.422185 0.802269179 10.29357
## 5 5.335078 0.024957415 12.53934
## 6 5.258709 0.003696322 13.20732
## 7 8.502092 0.636813716 15.88387
## 8 9.850170 2.779015974 15.06770
## 9 12.557992 10.061623405 14.68695
## 10 9.458866 1.453883746 16.20147
## 11 4.653678 0.017270995 11.01608
## 12 6.603426 0.196792073 13.83729
## 13 5.289779 0.265590647 10.81956
Number of ELIs: k
ggplot(d, aes(x=language, y=k_Mean, fill=language)) +
geom_boxplot() +
ylab("Estimated k") + xlab("") + ylim(0, 65) +
guides(fill=F) + ggtitle('Production') +
theme_bw() +
theme(axis.text.x=element_text(angle=90))

Rate of ELIs: \(\lambda\)
ggplot(d, aes(x=language, y=lambda_Mean, fill=language)) +
geom_boxplot() +
ylab(expression(paste("Estimated ", lambda))) + xlab("") +
guides(fill=F) + ggtitle('Production') + ylim(0, 3) +
theme_bw() +
theme(axis.text.x=element_text(angle=90))

Start Time: s
ggplot(d, aes(x=language, y=start_Mean, fill=language)) +
geom_boxplot() +
ylab("Estimated start (months)") + xlab("") +
guides(fill=F) + ggtitle('Production') +
theme_bw() + ylim(0,20) +
theme(axis.text.x=element_text(angle=90))

Parameter Values by MCDI Category for English
eng = subset(d, language=='English')
Number of ELIs: k
ggplot(eng, aes(x=category, y=k_Mean, fill=category)) +
geom_boxplot() +
ylab("Estimated mean k") + xlab("") +
guides(fill=F) + ggtitle('Production') +
theme_bw() +
theme(axis.text.x=element_text(angle=90))

Rate of ELIs: \(\lambda\)
ggplot(eng, aes(x=category, y=lambda_Mean, fill=category)) +
geom_boxplot() +
ylab(expression(paste("Estimated mean ",lambda))) + xlab("") +
guides(fill=F) + ggtitle('Production') +
theme_bw() +
theme(axis.text.x=element_text(angle=90))

Start Time: s
ggplot(eng, aes(x=category, y=start_Mean, fill=category)) +
geom_boxplot() +
ylab("Estimated mean start time (months)") + xlab("") +
guides(fill=F) + ggtitle('Production') +
theme_bw() +
theme(axis.text.x=element_text(angle=90))

Correlating Parameter Values with Childes Frequency
childes <- read.table("Production/childes.txt", header=F)
names(childes) <- c("item", "freq")
freq <- merge(eng, childes)
Number of ELIs: k
r = with(freq, cor.test(log(freq), k_Mean))
message(paste('Correlation with Log Frequency: ', r$estimate, ', p=', r$p.value, sep=''))
## Correlation with Log Frequency: 0.194982456814472, p=1.49143990786666e-06
ggplot(freq, aes(x=log(freq), y=k_Mean)) +
geom_point(alpha=0.5) +
stat_smooth(method='lm',formula=y~x) +
xlab("Log Frequency") +
ylab(expression(paste("Estimated ", k))) +
theme_bw()

Rate of ELIs: \(\lambda\)
r = with(freq, cor.test(log(freq), lambda_Mean))
message(paste('Correlation with Log Frequency: ', r$estimate, ', p=', r$p.value, sep=''))
## Correlation with Log Frequency: 0.315992381475656, p=2.22542944734414e-15
ggplot(freq, aes(x=log(freq), y=lambda_Mean)) +
geom_point(alpha=0.5) +
stat_smooth(method='lm',formula=y~x) +
xlab("Log Frequency") +
ylab(expression(paste("Estimated ", lambda))) +
theme_bw()

Start Time: s
r = with(freq, cor.test(log(freq), start_Mean))
message(paste('Correlation with Log Frequency: ', r$estimate, ', p=', r$p.value, sep=''))
## Correlation with Log Frequency: -0.218282049687356, p=6.63045744244183e-08
ggplot(freq, aes(x=log(freq), y=start_Mean)) +
geom_point(alpha=0.5) +
stat_smooth(method='lm',formula=y~x) +
xlab("Log Frequency") +
ylab(expression(paste("Estimated ", 'start time (months)'))) +
theme_bw()

Comparison of Variance Explained by Waiting for Data vs. Start Time
bootByHand = function(df, iters) {
RESULTS = NULL
for (i in 1:iters) {
N = length(df$language)
inx = sample(1:N, N, replace=T)
set = df[inx,]
R.t = var(set$k_Mean / set$lambda_Mean)
R.a = var(set$start_Mean) + var(set$k_Mean / set$lambda_Mean)
PropVar = R.t / R.a
set$t = set$k_Mean / set$lambda_Mean
set$a = set$start_Mean + set$t
PropTime = set$t / set$a
result = data.frame(PropVar = PropVar, PropTime = PropTime)
RESULTS = rbind(RESULTS, result)
}
return(RESULTS)
}
propBootP = ddply(d, .(language), bootByHand, 1000)
prP = ddply(propBootP, .(language), summarise, lowVar = quantile(PropVar, probs=0.05), lowT = quantile(PropTime, probs=0.05),
highVar = quantile(PropVar, probs=0.95), highT = quantile(PropTime, probs=0.95))
get_prop = function(d) {
R.s.comp = var(d$start_Mean)
R.t.comp = var(d$k_Mean / d$lambda_Mean)
R.a.comp = var(d$start_Mean) + var(d$k_Mean / d$lambda_Mean)
Vars=R.t.comp/R.a.comp
set = d
set$t = set$k_Mean / set$lambda_Mean
set$a = set$start_Mean + set$t
propTime = mean(set$t / set$a)
return(data.frame(PropV=Vars, PropT=propTime))
}
proP = ddply(d, .(language), get_prop)
propP = merge(prP, proP)
ggplot(propP, aes(x=language, y=PropV, fill=language)) +
geom_bar(stat='identity', position=position_dodge(0.9)) +
geom_errorbar(aes(ymin=lowVar,ymax=highVar), position=position_dodge(0.9), width=0.3) +
geom_point(aes(x=language, y=PropT), shape=2) +
geom_pointrange(aes(ymin=lowT, ymax=highT)) +
ylab('Percent') +
xlab('') +
guides(fill=F) +
ggtitle('Production') +
theme_bw() +
theme(axis.text.x=element_text(angle=90))

Production First Half Data
# Load in the data
data = read.csv('Production/First_Half_Production_Summary.csv')
converge = read.csv('Production/First_Half_Production_Convergence.csv')
converge$converged = "No"
converge$converged[converge$Rhat_k < 1.2 & converge$Rhat_l < 1.2 & converge$Rhat_s < 1.2] = 'Yes'
message(paste('Percent Chains Converged:', 100*dim(converge[converge$converged=='Yes',])[1] / dim(converge)[1]))
## Percent Chains Converged: 99.1908713692946
d = merge(data, converge[c('language','item.id','converged')])
d = subset(d, converged=='Yes')
Check Predictive Ability
pred = merge(mcdi, d[,c('language', 'item.id', 'k_Mean', 'lambda_Mean', 'start_Mean')])
PRED = NULL
for(q in split(pred, paste(pred$language, pred$item))) {
train <- subset(q, age<median(age))
test <- subset(q, age>=median(age))
train$successes = train$produces * train$n
train$failures = train$n - train$successes
g.Probit <- glm( cbind(successes, failures) ~ age, family=binomial(link="probit"), data=train)
g.Logit <- glm( cbind(successes, failures) ~ age, family=binomial(link="logit"), data=train)
PRED <- rbind(PRED, data.frame(language=q$language[1], item.id=q$item.id[1], item=q$item[1], category=q$category[1],
k=q$k_Mean[1], l=q$lambda_Mean[1], s=q$start_Mean[1], obs=test$produces, n=q$n[1], age=test$age,
Gamma=pgamma(test$age-q$start_Mean[1], shape=q$k_Mean[1], rate=q$lambda_Mean[1]),
Probit=predict(g.Probit, newdata=test, type="response"),
Logit=predict(g.Logit, newdata=test, type="response")
))
}
pred.m <- melt(PRED, id.vars=c('language', 'item.id', 'age', 'obs', 'n', 'category', 'item', 'k', 'l', 's'))
mycor.test <- function(df) {
cr <- cor.test(df$value, df$obs)
data.frame(estimate=cr$estimate, ymin=cr$conf.int[1], ymax=cr$conf.int[2])
}
m <- ddply( pred.m, c("language", "variable"), mycor.test)
m$Model <- m$variable
ggplot(m, aes(y=estimate, x=language, color=Model, ymin=ymin, ymax=ymax)) +
geom_point(position=position_dodge(.2)) +
geom_errorbar(position=position_dodge(.2),width=2) +
ylim(0,1) + theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
xlab("") + ylab("Coef. of Determination") +
ggtitle('Production')
