Comprehension All Data

# Load in the data
data = read.csv('Comprehension/All_Data_Comprehension_Summary.csv')
mcdi = read.csv('Comprehension/comprehension-mcdi.csv')
converge = read.csv('Comprehension/All_Data_Comprehension_Convergence.csv')
converge$converged = "No"
converge$converged[converge$Rhat_k < 1.2 & converge$Rhat_l < 1.2 & converge$Rhat_s < 1.2] = 'Yes'

message(paste('Percent Chains Converged:', 100*dim(converge[converge$converged=='Yes',])[1] / dim(converge)[1]))
## Percent Chains Converged: 100
d = merge(data, converge[c('language','item.id','converged')])
d = subset(d, converged=='Yes')

Check Model Fit

pred = merge(mcdi, d[,c('language', 'item.id', 'k_Mean', 'lambda_Mean', 'start_Mean')])

PRED = NULL
for(q in split(pred, paste(pred$language, pred$item))) {
    train <- q
    test  <- q
    train$successes = train$produces * train$n
    train$failures = train$n - train$successes
    
    g.Probit <- glm( cbind(successes, failures) ~ age, family=binomial(link="probit"), data=train)
    g.Logit  <- glm( cbind(successes, failures) ~ age, family=binomial(link="logit"),  data=train)
            
    PRED <- rbind(PRED, data.frame(language=q$language[1], item.id=q$item.id[1], item=q$item[1], category=q$category[1], 
                                   k=q$k_Mean[1], l=q$lambda_Mean[1], s=q$start_Mean[1], obs=test$produces, n=q$n[1], age=test$age,
                                    Gamma=pgamma(test$age-q$start_Mean[1], shape=q$k_Mean[1], rate=q$lambda_Mean[1]),
                                    Probit=predict(g.Probit, newdata=test, type="response"),
                                    Logit=predict(g.Logit, newdata=test, type="response")
                                    ))
}

pred.m <- melt(PRED, id.vars=c('language', 'item.id', 'age', 'obs', 'n', 'category', 'item', 'k', 'l', 's'))

mycor.test <- function(df) {
           cr <- cor.test(df$value, df$obs) 
           data.frame(estimate=cr$estimate, ymin=cr$conf.int[1], ymax=cr$conf.int[2])
 }

m <- ddply( pred.m, c("language", "variable"), mycor.test)

m$Model <- m$variable

ggplot(m, aes(y=estimate, x=language, color=Model, ymin=ymin, ymax=ymax)) + 
        geom_point() + geom_errorbar() + 
        ylim(0,1) + theme_bw() +
        theme(axis.text.x = element_text(angle = 90)) + 
        xlab("") + ylab("Coef. of Determination")

Check Parameter Values

mean = ddply(d, .(language), summarise, k_M=mean(k_Mean), k_low=mean(k_HDIlow), k_high=mean(k_HDIhigh), 
                                        l_M=mean(lambda_Mean), l_low=mean(lambda_HDIlow), l_high=mean(lambda_HDIhigh), 
                                        s_M=mean(start_Mean), s_low=mean(start_HDIlow), s_high=mean(start_HDIhigh))
mean
##    language       k_M     k_low    k_high       l_M     l_low    l_high
## 1  Croatian  9.644016 1.8062622 18.373625 0.7315679 0.1938595 1.3284935
## 2   English  8.029787 4.2911387 11.515038 0.5478457 0.3504155 0.7416199
## 3    Hebrew  7.620641 0.4732498 17.421344 0.4878021 0.0514822 1.0246063
## 4 Norwegian  8.373868 5.2124544 11.315159 0.5145952 0.3667719 0.6580483
## 5   Russian  4.431433 1.5429016  7.240964 0.3294491 0.1525450 0.5074002
## 6   Spanish  3.793611 1.4722129  6.177323 0.2186548 0.0880393 0.3571201
## 7   Swedish 11.730576 2.0890726 22.591622 0.8152540 0.2294195 1.4749902
## 8   Turkish  4.100707 1.4181395  6.746976 0.2736391 0.1083361 0.4406555
##        s_M       s_low    s_high
## 1 2.721253 0.008963274  6.483108
## 2 1.747270 0.005908660  4.311720
## 3 4.178839 0.001212481 10.109007
## 4 1.833643 0.100152903  4.163789
## 5 2.387364 0.001020219  5.654411
## 6 2.117560 0.007031424  5.145217
## 7 3.084349 0.006054663  7.378341
## 8 2.407704 0.020900557  5.636569

Number of ELIs: k

ggplot(d, aes(x=language, y=k_Mean, fill=language)) + 
    geom_boxplot() + 
    ylab("Estimated k") + xlab("") + ylim(0, 65) +
    guides(fill=F) + ggtitle('Comprehension') +
    theme_bw() +
    theme(axis.text.x=element_text(angle=90))

Rate of ELIs: \(\lambda\)

ggplot(d, aes(x=language, y=lambda_Mean, fill=language)) + 
    geom_boxplot() + 
    ylab(expression(paste("Estimated ", lambda))) + xlab("") +
    guides(fill=F) + ggtitle('Comprehension') + ylim(0, 3) +
    theme_bw() +
    theme(axis.text.x=element_text(angle=90))

Start Time: s

ggplot(d, aes(x=language, y=start_Mean, fill=language)) + 
    geom_boxplot() + 
    ylab("Estimated start (months)") + xlab("") +
    guides(fill=F) + ggtitle('Comprehension') +
    theme_bw() + ylim(0,20) +
    theme(axis.text.x=element_text(angle=90))

Parameter Values by MCDI Category for English

eng = subset(d, language=='English')

Number of ELIs: k

ggplot(eng, aes(x=category, y=k_Mean, fill=category)) + 
    geom_boxplot() + 
    ylab("Estimated mean k") + xlab("") +
    guides(fill=F) + ggtitle('Comprehension') +
    theme_bw() +
    theme(axis.text.x=element_text(angle=90))

Rate of ELIs: \(\lambda\)

ggplot(eng, aes(x=category, y=lambda_Mean, fill=category)) + 
    geom_boxplot() + 
    ylab(expression(paste("Estimated mean ",lambda))) + xlab("") +
    guides(fill=F) + ggtitle('Comprehension') +
    theme_bw() +
    theme(axis.text.x=element_text(angle=90))

Start Time: s

ggplot(eng, aes(x=category, y=start_Mean, fill=category)) + 
    geom_boxplot() + 
    ylab("Estimated mean start time (months)") + xlab("") +
    guides(fill=F) + ggtitle('Comprehension') +
    theme_bw() +
    theme(axis.text.x=element_text(angle=90))

Correlating Parameter Values with Childes Frequency

childes <- read.table("Comprehension/childes.txt", header=F)
names(childes) <- c("item", "freq")
freq <- merge(eng, childes)

Number of ELIs: k

r = with(freq, cor.test(log(freq), k_Mean))
message(paste('Correlation with Log Frequency: ', r$estimate, ', p=', r$p.value, sep=''))
## Correlation with Log Frequency: -0.138032644186667, p=0.0100437519818091
ggplot(freq, aes(x=log(freq), y=k_Mean)) + 
    geom_point(alpha=0.5) + 
    stat_smooth(method='lm',formula=y~x) + 
    xlab("Log Frequency") + 
    ylab(expression(paste("Estimated ", k))) + 
    theme_bw()

Rate of ELIs: \(\lambda\)

r = with(freq, cor.test(log(freq), lambda_Mean))
message(paste('Correlation with Log Frequency: ', r$estimate, ', p=', r$p.value, sep=''))
## Correlation with Log Frequency: -0.00683562220905604, p=0.899038891434976
ggplot(freq, aes(x=log(freq), y=lambda_Mean)) + 
    geom_point(alpha=0.5) + 
    stat_smooth(method='lm',formula=y~x) + 
    xlab("Log Frequency") + 
    ylab(expression(paste("Estimated ", lambda))) + 
    theme_bw()

Start Time: s

r = with(freq, cor.test(log(freq), start_Mean))
message(paste('Correlation with Log Frequency: ', r$estimate, ', p=', r$p.value, sep=''))
## Correlation with Log Frequency: 0.0383836163574097, p=0.476037437866545
ggplot(freq, aes(x=log(freq), y=start_Mean)) + 
    geom_point(alpha=0.5) + 
    stat_smooth(method='lm',formula=y~x) + 
    xlab("Log Frequency") + 
    ylab(expression(paste("Estimated ", 'start time (months)'))) + 
    theme_bw()

Comparison of Variance Explained by Waiting for Data vs. Start Time

bootByHand = function(df, iters) {
    RESULTS = NULL
    for (i in 1:iters) {
        N = length(df$language)
        inx = sample(1:N, N, replace=T)
        set = df[inx,]
        
        R.t = var(set$k_Mean / set$lambda_Mean)
        R.a = var(set$start_Mean) + var(set$k_Mean / set$lambda_Mean)
        PropVar = R.t / R.a

        set$t = set$k_Mean / set$lambda_Mean
        set$a = set$start_Mean + set$t
        PropTime = set$t / set$a
        
        result = data.frame(PropVar = PropVar, PropTime = PropTime)
        RESULTS = rbind(RESULTS, result)
    }
    return(RESULTS)
}

propBootC = ddply(d, .(language), bootByHand, 1000)
prC = ddply(propBootC, .(language), summarise, lowVar = quantile(PropVar, probs=0.05), lowT = quantile(PropTime, probs=0.05), 
            highVar = quantile(PropVar, probs=0.95), highT = quantile(PropTime, probs=0.95))

get_prop = function(d) {
    R.s.comp = var(d$start_Mean)
    R.t.comp = var(d$k_Mean / d$lambda_Mean)
    R.a.comp = var(d$start_Mean) + var(d$k_Mean / d$lambda_Mean)
    Vars=R.t.comp/R.a.comp
    
    set = d
    set$t = set$k_Mean / set$lambda_Mean
    set$a = set$start_Mean + set$t
    propTime = mean(set$t / set$a)
    
    return(data.frame(PropV=Vars, PropT=propTime))
}

proC = ddply(d, .(language), get_prop)
propC = merge(prC, proC)

ggplot(propC, aes(x=language, y=PropV, fill=language)) +
    geom_bar(stat='identity', position=position_dodge(0.9)) +
    geom_errorbar(aes(ymin=lowVar,ymax=highVar), position=position_dodge(0.9), width=0.3) +
    geom_point(aes(x=language, y=PropT), shape=2) +
    geom_pointrange(aes(ymin=lowT, ymax=highT)) +
    ylab('Percent') +
    xlab('') +
    guides(fill=F) +
    ggtitle('Comprehension') +
    theme_bw() +
    theme(axis.text.x=element_text(angle=90))

Comprehension First Half Data

# Load in the data
data = read.csv('Comprehension/First_Half_Comprehension_Summary.csv')
converge = read.csv('Comprehension/First_Half_Comprehension_Convergence.csv')
converge$converged = "No"
converge$converged[converge$Rhat_k < 1.2 & converge$Rhat_l < 1.2 & converge$Rhat_s < 1.2] = 'Yes'

message(paste('Percent Chains Converged:', 100*dim(converge[converge$converged=='Yes',])[1] / dim(converge)[1]))
## Percent Chains Converged: 96.3860369609856
d = merge(data, converge[c('language','item.id','converged')])
d = subset(d, converged=='Yes')

Check Predictive Ability

pred = merge(mcdi, d[,c('language', 'item.id', 'k_Mean', 'lambda_Mean', 'start_Mean')])

PRED = NULL
for(q in split(pred, paste(pred$language, pred$item))) {
    train <- subset(q, age<median(age))
    test  <- subset(q, age>=median(age))
    train$successes = train$produces * train$n
    train$failures = train$n - train$successes
    
    g.Probit <- glm( cbind(successes, failures) ~ age, family=binomial(link="probit"), data=train)
    g.Logit  <- glm( cbind(successes, failures) ~ age, family=binomial(link="logit"),  data=train)
            
    PRED <- rbind(PRED, data.frame(language=q$language[1], item.id=q$item.id[1], item=q$item[1], category=q$category[1], 
                                   k=q$k_Mean[1], l=q$lambda_Mean[1], s=q$start_Mean[1], obs=test$produces, n=q$n[1], age=test$age,
                                    Gamma=pgamma(test$age-q$start_Mean[1], shape=q$k_Mean[1], rate=q$lambda_Mean[1]),
                                    Probit=predict(g.Probit, newdata=test, type="response"),
                                    Logit=predict(g.Logit, newdata=test, type="response")
                                    ))
}

pred.m <- melt(PRED, id.vars=c('language', 'item.id', 'age', 'obs', 'n', 'category', 'item', 'k', 'l', 's'))

mycor.test <- function(df) {
           cr <- cor.test(df$value, df$obs) 
           data.frame(estimate=cr$estimate, ymin=cr$conf.int[1], ymax=cr$conf.int[2])
 }

m <- ddply( pred.m, c("language", "variable"), mycor.test)

m$Model <- m$variable

ggplot(m, aes(y=estimate, x=language, color=Model, ymin=ymin, ymax=ymax)) + 
        geom_point(position=position_dodge(.2)) + 
        geom_errorbar(position=position_dodge(.2),width=2) + 
        ylim(0,1) + theme_bw() +
        theme(axis.text.x = element_text(angle = 90)) + 
        xlab("") + ylab("Coef. of Determination") +
        ggtitle('Comprehension')