Comprehension All Data
# Load in the data
data = read.csv('Comprehension/All_Data_Comprehension_Summary.csv')
mcdi = read.csv('Comprehension/comprehension-mcdi.csv')
converge = read.csv('Comprehension/All_Data_Comprehension_Convergence.csv')
converge$converged = "No"
converge$converged[converge$Rhat_k < 1.2 & converge$Rhat_l < 1.2 & converge$Rhat_s < 1.2] = 'Yes'
message(paste('Percent Chains Converged:', 100*dim(converge[converge$converged=='Yes',])[1] / dim(converge)[1]))
## Percent Chains Converged: 100
d = merge(data, converge[c('language','item.id','converged')])
d = subset(d, converged=='Yes')
Check Model Fit
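The Gamma prediction computed below via pgamma(test$age - s, shape = k, rate = lambda) is the CDF of a Gamma waiting time shifted by the start time \(s\): the predicted probability that a child of age \(t\) comprehends the word is

\[
P(\text{comprehends by age } t) \;=\; F_{\Gamma}(t - s;\ k, \lambda) \;=\; \int_{0}^{t-s} \frac{\lambda^{k}\, x^{k-1} e^{-\lambda x}}{\Gamma(k)}\, dx, \qquad t > s,
\]

which is then compared against probit and logit regressions fit to the same observed proportions.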
pred = merge(mcdi, d[,c('language', 'item.id', 'k_Mean', 'lambda_Mean', 'start_Mean')])
PRED = NULL
for(q in split(pred, paste(pred$language, pred$item))) {
  # In-sample fit check: fit and evaluate on the same item's full data
  train <- q
  test <- q
  train$successes = train$produces * train$n
  train$failures = train$n - train$successes
  # Baseline probit and logit fits of the observed proportion on age
  g.Probit <- glm(cbind(successes, failures) ~ age, family=binomial(link="probit"), data=train)
  g.Logit <- glm(cbind(successes, failures) ~ age, family=binomial(link="logit"), data=train)
  PRED <- rbind(PRED, data.frame(language=q$language[1], item.id=q$item.id[1], item=q$item[1], category=q$category[1],
                                 k=q$k_Mean[1], l=q$lambda_Mean[1], s=q$start_Mean[1], obs=test$produces, n=q$n[1], age=test$age,
                                 Gamma=pgamma(test$age-q$start_Mean[1], shape=q$k_Mean[1], rate=q$lambda_Mean[1]),
                                 Probit=predict(g.Probit, newdata=test, type="response"),
                                 Logit=predict(g.Logit, newdata=test, type="response")))
}
pred.m <- melt(PRED, id.vars=c('language', 'item.id', 'age', 'obs', 'n', 'category', 'item', 'k', 'l', 's'))
mycor.test <- function(df) {
  cr <- cor.test(df$value, df$obs)
  data.frame(estimate=cr$estimate, ymin=cr$conf.int[1], ymax=cr$conf.int[2])
}
m <- ddply( pred.m, c("language", "variable"), mycor.test)
m$Model <- m$variable
ggplot(m, aes(y=estimate, x=language, color=Model, ymin=ymin, ymax=ymax)) +
geom_point() + geom_errorbar() +
ylim(0,1) + theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
xlab("") + ylab("Coef. of Determination")

Check Parameter Values
mean = ddply(d, .(language), summarise, k_M=mean(k_Mean), k_low=mean(k_HDIlow), k_high=mean(k_HDIhigh),
l_M=mean(lambda_Mean), l_low=mean(lambda_HDIlow), l_high=mean(lambda_HDIhigh),
s_M=mean(start_Mean), s_low=mean(start_HDIlow), s_high=mean(start_HDIhigh))
mean
## language k_M k_low k_high l_M l_low l_high
## 1 Croatian 9.644016 1.8062622 18.373625 0.7315679 0.1938595 1.3284935
## 2 English 8.029787 4.2911387 11.515038 0.5478457 0.3504155 0.7416199
## 3 Hebrew 7.620641 0.4732498 17.421344 0.4878021 0.0514822 1.0246063
## 4 Norwegian 8.373868 5.2124544 11.315159 0.5145952 0.3667719 0.6580483
## 5 Russian 4.431433 1.5429016 7.240964 0.3294491 0.1525450 0.5074002
## 6 Spanish 3.793611 1.4722129 6.177323 0.2186548 0.0880393 0.3571201
## 7 Swedish 11.730576 2.0890726 22.591622 0.8152540 0.2294195 1.4749902
## 8 Turkish 4.100707 1.4181395 6.746976 0.2736391 0.1083361 0.4406555
## s_M s_low s_high
## 1 2.721253 0.008963274 6.483108
## 2 1.747270 0.005908660 4.311720
## 3 4.178839 0.001212481 10.109007
## 4 1.833643 0.100152903 4.163789
## 5 2.387364 0.001020219 5.654411
## 6 2.117560 0.007031424 5.145217
## 7 3.084349 0.006054663 7.378341
## 8 2.407704 0.020900557 5.636569
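Since a Gamma(\(k, \lambda\)) waiting time has mean \(k/\lambda\), the model-implied expected age of acquisition for an item is roughly \(s + k/\lambda\). Plugging in the English means above, for example, gives \(1.75 + 8.03/0.548 \approx 16.4\) months; because this applies the formula to per-language averages of item-level posterior means, it is only a rough summary.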
Number of ELIs: k
ggplot(d, aes(x=language, y=k_Mean, fill=language)) +
geom_boxplot() +
ylab("Estimated k") + xlab("") + ylim(0, 65) +
guides(fill=F) + ggtitle('Comprehension') +
theme_bw() +
theme(axis.text.x=element_text(angle=90))

Rate of ELIs: \(\lambda\)
ggplot(d, aes(x=language, y=lambda_Mean, fill=language)) +
geom_boxplot() +
ylab(expression(paste("Estimated ", lambda))) + xlab("") +
guides(fill=F) + ggtitle('Comprehension') + ylim(0, 3) +
theme_bw() +
theme(axis.text.x=element_text(angle=90))

Start Time: s
ggplot(d, aes(x=language, y=start_Mean, fill=language)) +
geom_boxplot() +
ylab("Estimated start (months)") + xlab("") +
guides(fill=F) + ggtitle('Comprehension') +
theme_bw() + ylim(0,20) +
theme(axis.text.x=element_text(angle=90))

Parameter Values by MCDI Category for English
eng = subset(d, language=='English')
Number of ELIs: k
ggplot(eng, aes(x=category, y=k_Mean, fill=category)) +
geom_boxplot() +
ylab("Estimated mean k") + xlab("") +
guides(fill=F) + ggtitle('Comprehension') +
theme_bw() +
theme(axis.text.x=element_text(angle=90))

Rate of ELIs: \(\lambda\)
ggplot(eng, aes(x=category, y=lambda_Mean, fill=category)) +
geom_boxplot() +
ylab(expression(paste("Estimated mean ",lambda))) + xlab("") +
guides(fill=F) + ggtitle('Comprehension') +
theme_bw() +
theme(axis.text.x=element_text(angle=90))

Start Time: s
ggplot(eng, aes(x=category, y=start_Mean, fill=category)) +
geom_boxplot() +
ylab("Estimated mean start time (months)") + xlab("") +
guides(fill=F) + ggtitle('Comprehension') +
theme_bw() +
theme(axis.text.x=element_text(angle=90))

Correlating Parameter Values with CHILDES Frequency
childes <- read.table("Comprehension/childes.txt", header=F)
names(childes) <- c("item", "freq")
freq <- merge(eng, childes)
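merge(eng, childes) joins on the shared item column, so English items without a CHILDES count are dropped; one quick sanity check on the match rate would be:
# Optional check (not part of the pipeline): how many items matched a CHILDES entry
message(paste('English items with CHILDES frequency:', nrow(freq), 'of', nrow(eng)))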
Number of ELIs: k
r = with(freq, cor.test(log(freq), k_Mean))
message(paste('Correlation with Log Frequency: ', r$estimate, ', p=', r$p.value, sep=''))
## Correlation with Log Frequency: -0.138032644186667, p=0.0100437519818091
ggplot(freq, aes(x=log(freq), y=k_Mean)) +
geom_point(alpha=0.5) +
stat_smooth(method='lm',formula=y~x) +
xlab("Log Frequency") +
ylab(expression(paste("Estimated ", k))) +
theme_bw()

Rate of ELIs: \(\lambda\)
r = with(freq, cor.test(log(freq), lambda_Mean))
message(paste('Correlation with Log Frequency: ', r$estimate, ', p=', r$p.value, sep=''))
## Correlation with Log Frequency: -0.00683562220905604, p=0.899038891434976
ggplot(freq, aes(x=log(freq), y=lambda_Mean)) +
geom_point(alpha=0.5) +
stat_smooth(method='lm',formula=y~x) +
xlab("Log Frequency") +
ylab(expression(paste("Estimated ", lambda))) +
theme_bw()

Start Time: s
r = with(freq, cor.test(log(freq), start_Mean))
message(paste('Correlation with Log Frequency: ', r$estimate, ', p=', r$p.value, sep=''))
## Correlation with Log Frequency: 0.0383836163574097, p=0.476037437866545
ggplot(freq, aes(x=log(freq), y=start_Mean)) +
geom_point(alpha=0.5) +
stat_smooth(method='lm',formula=y~x) +
xlab("Log Frequency") +
ylab(expression(paste("Estimated ", 'start time (months)'))) +
theme_bw()

Comparison of Variance Explained by Waiting for Data vs. Start Time
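Here each item's waiting time is summarized by \(t = k/\lambda\) (using the posterior means of \(k\) and \(\lambda\)) and its total age of acquisition by \(a = s + t\). For each language the code below bootstraps two quantities over items:

\[
\text{PropVar} = \frac{\operatorname{Var}(t)}{\operatorname{Var}(s) + \operatorname{Var}(t)}, \qquad \text{PropTime}_i = \frac{t_i}{s_i + t_i},
\]

i.e. the share of variance in acquisition age attributable to waiting for data (the denominator simply sums the two variances, so any covariance between \(s\) and \(t\) is ignored) and each item's share of its total acquisition time spent waiting.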
# Nonparametric bootstrap over items within a language
bootByHand = function(df, iters) {
  RESULTS = NULL
  for (i in 1:iters) {
    # Resample items with replacement
    N = length(df$language)
    inx = sample(1:N, N, replace=T)
    set = df[inx,]
    # Proportion of variance due to the waiting time t = k/lambda
    # (variances of s and t are summed, so their covariance is ignored)
    R.t = var(set$k_Mean / set$lambda_Mean)
    R.a = var(set$start_Mean) + var(set$k_Mean / set$lambda_Mean)
    PropVar = R.t / R.a
    # Per-item proportion of total acquisition time spent waiting for data
    set$t = set$k_Mean / set$lambda_Mean
    set$a = set$start_Mean + set$t
    PropTime = set$t / set$a
    result = data.frame(PropVar = PropVar, PropTime = PropTime)
    RESULTS = rbind(RESULTS, result)
  }
  return(RESULTS)
}
propBootC = ddply(d, .(language), bootByHand, 1000)
prC = ddply(propBootC, .(language), summarise, lowVar = quantile(PropVar, probs=0.05), lowT = quantile(PropTime, probs=0.05),
highVar = quantile(PropVar, probs=0.95), highT = quantile(PropTime, probs=0.95))
# Point estimates of the same quantities on the full (non-resampled) data
get_prop = function(d) {
  R.s.comp = var(d$start_Mean)              # variance of start times
  R.t.comp = var(d$k_Mean / d$lambda_Mean)  # variance of waiting times
  R.a.comp = R.s.comp + R.t.comp
  Vars = R.t.comp / R.a.comp
  set = d
  set$t = set$k_Mean / set$lambda_Mean
  set$a = set$start_Mean + set$t
  propTime = mean(set$t / set$a)            # mean per-item proportion of time spent waiting
  return(data.frame(PropV=Vars, PropT=propTime))
}
proC = ddply(d, .(language), get_prop)
propC = merge(prC, proC)
ggplot(propC, aes(x=language, y=PropV, fill=language)) +
geom_bar(stat='identity', position=position_dodge(0.9)) +
geom_errorbar(aes(ymin=lowVar,ymax=highVar), position=position_dodge(0.9), width=0.3) +
geom_point(aes(x=language, y=PropT), shape=2) +
geom_pointrange(aes(y=PropT, ymin=lowT, ymax=highT)) +
ylab('Proportion') +
xlab('') +
guides(fill=F) +
ggtitle('Comprehension') +
theme_bw() +
theme(axis.text.x=element_text(angle=90))

Comprehension First Half Data
# Load in the data
data = read.csv('Comprehension/First_Half_Comprehension_Summary.csv')
converge = read.csv('Comprehension/First_Half_Comprehension_Convergence.csv')
converge$converged = "No"
converge$converged[converge$Rhat_k < 1.2 & converge$Rhat_l < 1.2 & converge$Rhat_s < 1.2] = 'Yes'
message(paste('Percent Chains Converged:', 100*dim(converge[converge$converged=='Yes',])[1] / dim(converge)[1]))
## Percent Chains Converged: 96.3860369609856
d = merge(data, converge[c('language','item.id','converged')])
d = subset(d, converged=='Yes')
Check Predictive Ability
pred = merge(mcdi, d[,c('language', 'item.id', 'k_Mean', 'lambda_Mean', 'start_Mean')])
PRED = NULL
for(q in split(pred, paste(pred$language, pred$item))) {
  # Held-out evaluation: fit on ages below the median, predict the older half
  train <- subset(q, age < median(age))
  test <- subset(q, age >= median(age))
  train$successes = train$produces * train$n
  train$failures = train$n - train$successes
  g.Probit <- glm(cbind(successes, failures) ~ age, family=binomial(link="probit"), data=train)
  g.Logit <- glm(cbind(successes, failures) ~ age, family=binomial(link="logit"), data=train)
  PRED <- rbind(PRED, data.frame(language=q$language[1], item.id=q$item.id[1], item=q$item[1], category=q$category[1],
                                 k=q$k_Mean[1], l=q$lambda_Mean[1], s=q$start_Mean[1], obs=test$produces, n=q$n[1], age=test$age,
                                 Gamma=pgamma(test$age-q$start_Mean[1], shape=q$k_Mean[1], rate=q$lambda_Mean[1]),
                                 Probit=predict(g.Probit, newdata=test, type="response"),
                                 Logit=predict(g.Logit, newdata=test, type="response")))
}
pred.m <- melt(PRED, id.vars=c('language', 'item.id', 'age', 'obs', 'n', 'category', 'item', 'k', 'l', 's'))
mycor.test <- function(df) {
  cr <- cor.test(df$value, df$obs)
  data.frame(estimate=cr$estimate, ymin=cr$conf.int[1], ymax=cr$conf.int[2])
}
m <- ddply( pred.m, c("language", "variable"), mycor.test)
m$Model <- m$variable
ggplot(m, aes(y=estimate, x=language, color=Model, ymin=ymin, ymax=ymax)) +
geom_point(position=position_dodge(.2)) +
geom_errorbar(position=position_dodge(.2),width=2) +
ylim(0,1) + theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
xlab("") + ylab("Coef. of Determination") +
ggtitle('Comprehension')
