##
## Our solution to HW assignment 2
##

## Problem 1: German NP and PP constructions

library(zipfR)  # start by loading the zipfR library

## - Import the German NP and PP data sets. 
data(TigerNP.spc, TigerPP.spc)

N(TigerNP.spc)  # check which data set is larger
N(TigerPP.spc)

## alternatively, use N from empirical vgc

N1 <- N(TigerNP.spc) # NP data set is larger, so we will plot curves up to N1 = N(TigerNP.spc)
N.values <- seq(0, N1, steps=100) # sequence with specified number of steps between endpoints

## - For the data set with more tokens, compute a binomially interpolated vocabulary growth curve (VGC).
NP.vgc <- vgc.interp(TigerNP.spc, N.values) # binomial interpolation (be patient!)

## - For the smaller one, estimate an LNRE model ...
PP.gigp <- lnre("gigp", spc=TigerPP.spc)    # estimate GIGP model on PP data
summary(PP.gigp)  # note the excellent goodness-of-fit (fZM is even better)

## - ... and use it to compute an expected VGC up to the size of the larger data set.
PP.vgc <- lnre.vgc(PP.gigp, N.values)

## - Plot the interpolated and extrapolated VGCs, and determine which of the two constructions appears to be more productive
# vertical line (N0 parameter) indicates size of smaller sample (PP data), i.e. where extrapolation with LNRE model starts
plot(NP.vgc, PP.vgc, legend=c("NP","PP"), N0=N(TigerPP.spc))

# Conclusion: both curves have similar shape, but the NP curve produces consistenly more types, so the NP rules seem to be more productive than PP rules.

# An interesting extension is to compare the interpolated / model-based growth curves with actual vocabulary growth (for NP and PP rules) in the Tiger corpus
data(TigerNP.emp.vgc, TigerPP.emp.vgc)
plot(NP.vgc, PP.vgc, TigerNP.emp.vgc, TigerPP.emp.vgc, legend=c("NP (interpolated)", "PP (GIGP)", "NP (actual)", "PP (actual)"))

# Actual PP growth seems to be somewhat steeper than predicted by the model, possibly because of non-randomness in the data.  As a consequence, the true divergence of vocabulary growth curves may not be as large as suggested by our analysis (in fact, PP and NP curves almost seem to run parallel in the range from N=60,000 to N=90,000).


## Problem 2: Data lost with cut-off points

## - First, load the file bigrams.100k.tfl (containing bigrams extracted from the first 100,000 tokens of the Brown corpus)
bigrams.tfl <- read.tfl("bigrams.100k.tfl") # don't forget to set the working directory first
bigrams.spc <- tfl2spc(bigrams.tfl)         # compute frequency spectrum

## - What proportion of bigram types occur only once? What proportion of types occur 4 times or less?
print(bigrams.spc)  # you can also calculate the proportions "by hand" from the printout
plot(bigrams.spc)   # visualise the frequency spectrum

Vm(bigrams.spc, 1) / V(bigrams.spc)  # proportion of bigram types occurring once
sum(Vm(bigrams.spc, 1:4)) / V(bigrams.spc) # note how Vm() takes a vector to return V1, V2, V3, V4

## - Now, suppose that we want to use our data to estimate the proportion of bigram types that would be lost by using the same two frequency cut-offs if we had a 1 million word corpus.  How will you go about it?
bigrams.zm <- lnre("zm", spc=bigrams.spc)  # estimate LNRE model
# Baroni & Evert (2007) suggest that ZM model might give the best extrapolation accuracy if there is non-randomness, even though GIGP has much better goodness-of-fit

# now we can use the model to predict relevant information for a sample of 1M tokens
N <- 10 * N(bigrams.spc)  # number of bigram tokens is smaller than number of words, so we can't just set N = 1e6 in order to make predictions for corpus of 1M tokens; instead, we estimate the number of bigram tokens in the 1M word corpus as 10 times the number of bigram tokens in the 100k corpus
## although if you did use 1e6, that's ok for the hw!

EVm(bigrams.zm, 1, N) / EV(bigrams.zm, N)
sum(EVm(bigrams.zm, 1:4, N)) / EV(bigrams.zm, N)
# note that the type proportions are somewhat smaller than for 100k tokens

# Final remark: comparison with the actual frequency spectrum for bigrams from the full 1M word Brown corpus shows that the predictions made by ZM are indeed much better than those of GIGP, especially for the proportion of hapax legomena (true value: 74.7%, ZM predicts 80.5%, GIGP predicts 65.4%).