Production All Data

# Load in the data
data = read.csv('Production/All_Data_Production_Summary.csv')
mcdi = read.csv('Production/production-mcdi.csv')
converge = read.csv('Production/All_Data_Production_Convergence.csv')
converge$converged = "No"
converge$converged[converge$Rhat_k < 1.2 & converge$Rhat_l < 1.2 & converge$Rhat_s < 1.2] = 'Yes'

message(paste('Percent Chains Converged:', 100*dim(converge[converge$converged=='Yes',])[1] / dim(converge)[1]))
## Percent Chains Converged: 100
d = merge(data, converge[c('language','item.id','converged')])
d = subset(d, converged=='Yes')

Check Model Fit

pred = merge(mcdi, d[,c('language', 'item.id', 'k_Mean', 'lambda_Mean', 'start_Mean')])

PRED = NULL
for(q in split(pred, paste(pred$language, pred$item))) {
    train <- q
    test  <- q
    train$successes = train$produces * train$n
    train$failures = train$n - train$successes
    
    g.Probit <- glm( cbind(successes, failures) ~ age, family=binomial(link="probit"), data=train)
    g.Logit  <- glm( cbind(successes, failures) ~ age, family=binomial(link="logit"),  data=train)
            
    PRED <- rbind(PRED, data.frame(language=q$language[1], item.id=q$item.id[1], item=q$item[1], category=q$category[1], 
                                   k=q$k_Mean[1], l=q$lambda_Mean[1], s=q$start_Mean[1], obs=test$produces, n=q$n[1], age=test$age,
                                    Gamma=pgamma(test$age-q$start_Mean[1], shape=q$k_Mean[1], rate=q$lambda_Mean[1]),
                                    Probit=predict(g.Probit, newdata=test, type="response"),
                                    Logit=predict(g.Logit, newdata=test, type="response")
                                    ))
}

pred.m <- melt(PRED, id.vars=c('language', 'item.id', 'age', 'obs', 'n', 'category', 'item', 'k', 'l', 's'))

mycor.test <- function(df) {
           cr <- cor.test(df$value, df$obs) 
           data.frame(estimate=cr$estimate, ymin=cr$conf.int[1], ymax=cr$conf.int[2])
 }

m <- ddply( pred.m, c("language", "variable"), mycor.test)

m$Model <- m$variable

ggplot(m, aes(y=estimate, x=language, color=Model, ymin=ymin, ymax=ymax)) + 
        geom_point() + geom_errorbar() + 
        ylim(0,1) + theme_bw() +
        theme(axis.text.x = element_text(angle = 90)) + 
        xlab("") + ylab("Coef. of Determination")

Check Parameter Values

# Remove k_Mean > 3000, which are all syntactic structures not lexical items
d = subset(d, k_Mean < 3000)
mean = ddply(d, .(language), summarise, k_M=mean(k_Mean), k_low=mean(k_HDIlow), k_high=mean(k_HDIhigh), 
                                        l_M=mean(lambda_Mean), l_low=mean(lambda_HDIlow), l_high=mean(lambda_HDIhigh), 
                                        s_M=mean(start_Mean), s_low=mean(start_HDIlow), s_high=mean(start_HDIhigh))
mean
##     language       k_M     k_low    k_high       l_M      l_low    l_high
## 1  Cantonese 13.384701 3.2131192 23.814307 0.6012640 0.25130280 0.9471624
## 2   Croatian 12.089559 1.9733039 22.971015 0.5404920 0.16511126 0.9355322
## 3     Danish 10.700966 5.2738375 16.929480 0.4903437 0.32540963 0.6683131
## 4    English 11.428233 5.4411342 17.699654 0.5317706 0.34118729 0.7198872
## 5     German 14.772612 4.1255255 24.655968 0.6743865 0.32403650 1.0002087
## 6     Hebrew 16.012298 0.7462316 35.734628 0.8398933 0.08735008 1.7611053
## 7    Italian  8.142294 1.6986016 16.241610 0.4102736 0.16649108 0.6749154
## 8   Mandarin  7.758743 1.4633166 17.956013 0.5776843 0.25345863 0.9864020
## 9  Norwegian  5.021387 3.2914092  7.124552 0.3091743 0.24764179 0.3782119
## 10   Russian  5.830934 1.3340862 11.931618 0.2894680 0.12136921 0.4810541
## 11   Spanish  7.135218 2.3896737 11.663059 0.2918068 0.13546774 0.4447380
## 12   Swedish 15.214969 2.9555671 28.549485 0.7045917 0.25139306 1.1653782
## 13   Turkish  8.024480 3.6727073 12.395066 0.3449581 0.21699089 0.4689106
##          s_M        s_low   s_high
## 1   5.894536  0.063793113 13.05651
## 2   5.771750  0.013516278 13.55681
## 3   8.155522  3.120217691 12.83247
## 4   5.422185  0.802269179 10.29357
## 5   5.335078  0.024957415 12.53934
## 6   5.258709  0.003696322 13.20732
## 7   8.502092  0.636813716 15.88387
## 8   9.850170  2.779015974 15.06770
## 9  12.557992 10.061623405 14.68695
## 10  9.458866  1.453883746 16.20147
## 11  4.653678  0.017270995 11.01608
## 12  6.603426  0.196792073 13.83729
## 13  5.289779  0.265590647 10.81956

Number of ELIs: k

ggplot(d, aes(x=language, y=k_Mean, fill=language)) + 
    geom_boxplot() + 
    ylab("Estimated k") + xlab("") + ylim(0, 65) +
    guides(fill=F) + ggtitle('Production') +
    theme_bw() +
    theme(axis.text.x=element_text(angle=90))

Rate of ELIs: \(\lambda\)

ggplot(d, aes(x=language, y=lambda_Mean, fill=language)) + 
    geom_boxplot() + 
    ylab(expression(paste("Estimated ", lambda))) + xlab("") +
    guides(fill=F) + ggtitle('Production') + ylim(0, 3) +
    theme_bw() +
    theme(axis.text.x=element_text(angle=90))

Start Time: s

ggplot(d, aes(x=language, y=start_Mean, fill=language)) + 
    geom_boxplot() + 
    ylab("Estimated start (months)") + xlab("") +
    guides(fill=F) + ggtitle('Production') +
    theme_bw() + ylim(0,20) +
    theme(axis.text.x=element_text(angle=90))

Parameter Values by MCDI Category for English

eng = subset(d, language=='English')

Number of ELIs: k

ggplot(eng, aes(x=category, y=k_Mean, fill=category)) + 
    geom_boxplot() + 
    ylab("Estimated mean k") + xlab("") +
    guides(fill=F) + ggtitle('Production') +
    theme_bw() +
    theme(axis.text.x=element_text(angle=90))

Rate of ELIs: \(\lambda\)

ggplot(eng, aes(x=category, y=lambda_Mean, fill=category)) + 
    geom_boxplot() + 
    ylab(expression(paste("Estimated mean ",lambda))) + xlab("") +
    guides(fill=F) + ggtitle('Production') +
    theme_bw() +
    theme(axis.text.x=element_text(angle=90))

Start Time: s

ggplot(eng, aes(x=category, y=start_Mean, fill=category)) + 
    geom_boxplot() + 
    ylab("Estimated mean start time (months)") + xlab("") +
    guides(fill=F) + ggtitle('Production') +
    theme_bw() +
    theme(axis.text.x=element_text(angle=90))

Correlating Parameter Values with Childes Frequency

childes <- read.table("Production/childes.txt", header=F)
names(childes) <- c("item", "freq")
freq <- merge(eng, childes)

Number of ELIs: k

r = with(freq, cor.test(log(freq), k_Mean))
message(paste('Correlation with Log Frequency: ', r$estimate, ', p=', r$p.value, sep=''))
## Correlation with Log Frequency: 0.194982456814472, p=1.49143990786666e-06
ggplot(freq, aes(x=log(freq), y=k_Mean)) + 
    geom_point(alpha=0.5) + 
    stat_smooth(method='lm',formula=y~x) + 
    xlab("Log Frequency") + 
    ylab(expression(paste("Estimated ", k))) + 
    theme_bw()

Rate of ELIs: \(\lambda\)

r = with(freq, cor.test(log(freq), lambda_Mean))
message(paste('Correlation with Log Frequency: ', r$estimate, ', p=', r$p.value, sep=''))
## Correlation with Log Frequency: 0.315992381475656, p=2.22542944734414e-15
ggplot(freq, aes(x=log(freq), y=lambda_Mean)) + 
    geom_point(alpha=0.5) + 
    stat_smooth(method='lm',formula=y~x) + 
    xlab("Log Frequency") + 
    ylab(expression(paste("Estimated ", lambda))) + 
    theme_bw()

Start Time: s

r = with(freq, cor.test(log(freq), start_Mean))
message(paste('Correlation with Log Frequency: ', r$estimate, ', p=', r$p.value, sep=''))
## Correlation with Log Frequency: -0.218282049687356, p=6.63045744244183e-08
ggplot(freq, aes(x=log(freq), y=start_Mean)) + 
    geom_point(alpha=0.5) + 
    stat_smooth(method='lm',formula=y~x) + 
    xlab("Log Frequency") + 
    ylab(expression(paste("Estimated ", 'start time (months)'))) + 
    theme_bw()

Comparison of Variance Explained by Waiting for Data vs. Start Time

bootByHand = function(df, iters) {
    RESULTS = NULL
    for (i in 1:iters) {
        N = length(df$language)
        inx = sample(1:N, N, replace=T)
        set = df[inx,]
        
        R.t = var(set$k_Mean / set$lambda_Mean)
        R.a = var(set$start_Mean) + var(set$k_Mean / set$lambda_Mean)
        PropVar = R.t / R.a

        set$t = set$k_Mean / set$lambda_Mean
        set$a = set$start_Mean + set$t
        PropTime = set$t / set$a
        
        result = data.frame(PropVar = PropVar, PropTime = PropTime)
        RESULTS = rbind(RESULTS, result)
    }
    return(RESULTS)
}

propBootP = ddply(d, .(language), bootByHand, 1000)
prP = ddply(propBootP, .(language), summarise, lowVar = quantile(PropVar, probs=0.05), lowT = quantile(PropTime, probs=0.05), 
            highVar = quantile(PropVar, probs=0.95), highT = quantile(PropTime, probs=0.95))

get_prop = function(d) {
    R.s.comp = var(d$start_Mean)
    R.t.comp = var(d$k_Mean / d$lambda_Mean)
    R.a.comp = var(d$start_Mean) + var(d$k_Mean / d$lambda_Mean)
    Vars=R.t.comp/R.a.comp
    
    set = d
    set$t = set$k_Mean / set$lambda_Mean
    set$a = set$start_Mean + set$t
    propTime = mean(set$t / set$a)
    
    return(data.frame(PropV=Vars, PropT=propTime))
}

proP = ddply(d, .(language), get_prop)
propP = merge(prP, proP)

ggplot(propP, aes(x=language, y=PropV, fill=language)) +
    geom_bar(stat='identity', position=position_dodge(0.9)) +
    geom_errorbar(aes(ymin=lowVar,ymax=highVar), position=position_dodge(0.9), width=0.3) +
    geom_point(aes(x=language, y=PropT), shape=2) +
    geom_pointrange(aes(ymin=lowT, ymax=highT)) +
    ylab('Percent') +
    xlab('') +
    guides(fill=F) +
    ggtitle('Production') +
    theme_bw() +
    theme(axis.text.x=element_text(angle=90))

Production First Half Data

# Load in the data
data = read.csv('Production/First_Half_Production_Summary.csv')
converge = read.csv('Production/First_Half_Production_Convergence.csv')
converge$converged = "No"
converge$converged[converge$Rhat_k < 1.2 & converge$Rhat_l < 1.2 & converge$Rhat_s < 1.2] = 'Yes'

message(paste('Percent Chains Converged:', 100*dim(converge[converge$converged=='Yes',])[1] / dim(converge)[1]))
## Percent Chains Converged: 99.1908713692946
d = merge(data, converge[c('language','item.id','converged')])
d = subset(d, converged=='Yes')

Check Predictive Ability

pred = merge(mcdi, d[,c('language', 'item.id', 'k_Mean', 'lambda_Mean', 'start_Mean')])

PRED = NULL
for(q in split(pred, paste(pred$language, pred$item))) {
    train <- subset(q, age<median(age))
    test  <- subset(q, age>=median(age))
    train$successes = train$produces * train$n
    train$failures = train$n - train$successes
    
    g.Probit <- glm( cbind(successes, failures) ~ age, family=binomial(link="probit"), data=train)
    g.Logit  <- glm( cbind(successes, failures) ~ age, family=binomial(link="logit"),  data=train)
            
    PRED <- rbind(PRED, data.frame(language=q$language[1], item.id=q$item.id[1], item=q$item[1], category=q$category[1], 
                                   k=q$k_Mean[1], l=q$lambda_Mean[1], s=q$start_Mean[1], obs=test$produces, n=q$n[1], age=test$age,
                                    Gamma=pgamma(test$age-q$start_Mean[1], shape=q$k_Mean[1], rate=q$lambda_Mean[1]),
                                    Probit=predict(g.Probit, newdata=test, type="response"),
                                    Logit=predict(g.Logit, newdata=test, type="response")
                                    ))
}

pred.m <- melt(PRED, id.vars=c('language', 'item.id', 'age', 'obs', 'n', 'category', 'item', 'k', 'l', 's'))

mycor.test <- function(df) {
           cr <- cor.test(df$value, df$obs) 
           data.frame(estimate=cr$estimate, ymin=cr$conf.int[1], ymax=cr$conf.int[2])
 }

m <- ddply( pred.m, c("language", "variable"), mycor.test)

m$Model <- m$variable

ggplot(m, aes(y=estimate, x=language, color=Model, ymin=ymin, ymax=ymax)) + 
        geom_point(position=position_dodge(.2)) + 
        geom_errorbar(position=position_dodge(.2),width=2) + 
        ylim(0,1) + theme_bw() +
        theme(axis.text.x = element_text(angle = 90)) + 
        xlab("") + ylab("Coef. of Determination") +
        ggtitle('Production')