##
## Our solution to Exercise #4 (surface collocations and written/spoken keywords)
##
## Question 1. How to install add-on packages from CRAN
# a) use the package installer in the Windows / Mac OS X GUI
# b) download the appropriate package file from CRAN, then install it from the
#    command line with R CMD INSTALL
# c) install from within R with the function install.packages()
install.packages("corpora")
# see ?install.packages for more options
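# optional: if you cannot write to the system-wide library (e.g. you have no
# administrator rights), install.packages() and library() accept a personal
# library directory -- the path below is only an example, adjust it as needed
# install.packages("corpora", lib="~/R/library")
# library(corpora, lib.loc="~/R/library")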
## Question 2. Load library "corpora" and data sets, read data set documentation
library(corpora)
?BNCInChargeOf
data(BNCInChargeOf)
?BNCcomparison
data(BNCcomparison)
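# optional: take a quick look at the structure and first rows of both data sets
str(BNCInChargeOf)
head(BNCcomparison)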
## Question 3. Contingency tables for surface cooccurrences.
# According to the lecture slides, the first row of the contingency table represents
# word tokens within the collocational spans, the second row word tokens outside the
# collocational spans (for the given node, here the phrase "in charge of"). For each
# collocate W, the first column contains the occurrences of W, and the second column
# all other tokens. This leads to the following equations for the observed
# frequencies O11, O12, O21, O22:
COLL <- transform(BNCInChargeOf,
  O11 = as.numeric(f.in), O12 = as.numeric(N.in - f.in),
  O21 = as.numeric(f.out), O22 = as.numeric(N.out - f.out))
# note that we also convert the new variables to floating-point format, to avoid integer overflow
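# optional sanity check (not part of the exercise): the derived cell counts
# should never be negative if the frequency data are consistent
stopifnot(all(COLL$O12 >= 0), all(COLL$O22 >= 0))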
## Question 4. Marginal frequencies, sample size, expected frequencies, association measures.
# calculate marginal frequencies and sample size as in lecture slides
COLL <- transform(COLL,
  R1 = O11 + O12, R2 = O21 + O22,
  C1 = O11 + O21, C2 = O12 + O22,
  N = O11 + O12 + O21 + O22)
summary(COLL$R1) # should always be the same value (N.in)
summary(COLL$R2) # should always be the same value (N.out)
summary(COLL$N)  # should always be the same value (corpus size)
# calculate expected frequencies
COLL <- transform(COLL,
  E11 = R1 * C1 / N, E12 = R1 * C2 / N,
  E21 = R2 * C1 / N, E22 = R2 * C2 / N)
all.equal(COLL$E11 + COLL$E12 + COLL$E21 + COLL$E22, COLL$N) # check consistency
# calculate association scores: here we use the Dice coefficient for illustration
COLL <- transform(COLL,
  Dice = 2 * O11 / (R1 + C1))
# (equation from www.collocations.de/AM)
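# as an optional aside (not part of the exercise solution), further association
# measures from www.collocations.de/AM can be computed in the same way, e.g. the
# t-score (only meaningful for O11 > 0) and the log-likelihood ratio G2, where
# cells with O = 0 contribute 0 to the sum
COLL <- transform(COLL,
  t.score = (O11 - E11) / sqrt(O11),
  G2 = 2 * (ifelse(O11 > 0, O11 * log(O11 / E11), 0) +
            ifelse(O12 > 0, O12 * log(O12 / E12), 0) +
            ifelse(O21 > 0, O21 * log(O21 / E21), 0) +
            ifelse(O22 > 0, O22 * log(O22 / E22), 0)))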
# now rank the data set by Dice scores (we could also annotate explicit ranks,
# which makes it easier to compare different association measures; see below)
idx.Dice <- order(COLL$Dice, decreasing=TRUE)
COLL.Dice <- COLL[idx.Dice, ]
head(COLL.Dice[, c("collocate", "f.in", "f.out", "Dice")], 20) # select only relevant columns for better readability
# TMTOWTDI -- the lecture slides show other ways of sorting the data set,
# computing rankings and extracting n-best lists; one option is sketched below
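# sketch of the rank annotation mentioned above: explicit ranks (tied scores share
# the smallest rank number) make it easy to compare the rankings of different measures
COLL <- transform(COLL,
  r.Dice = rank(-Dice, ties.method="min"))
head(COLL[order(COLL$r.Dice), c("collocate", "Dice", "r.Dice")], 10)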
## Question 5/6. Keyword identification.
# written and spoken sample sizes (NB: the "OTHER" entry, which aggregates all
# remaining nouns, allows us to compute these directly from the table)
N.written <- sum(BNCcomparison$written)
N.spoken <- sum(BNCcomparison$spoken)
# Contingency table for the frequency comparison, as explained in the lecture slides:
# - the first column contains the data for the spoken sample, the second column for
#   the written sample
# - the first row contains the frequency count of the given noun in the spoken and
#   written sample
# - the second row contains the sample size minus the frequency count
# NB: we have swapped the order of the columns, because some association measures
# work better if the smaller sample is in the first column (as is the case for
# cooccurrence data)
KEY <- transform(BNCcomparison,
  O11 = as.numeric(spoken), O21 = as.numeric(N.spoken - spoken),
  O12 = as.numeric(written), O22 = as.numeric(N.written - written))
## Question 7. Marginal frequencies, N, expected frequencies, association measures (as above)
# calculate marginal frequencies and "total sample size" N as above
KEY <- transform(KEY,
  R1 = O11 + O12, R2 = O21 + O22,
  C1 = O11 + O21, C2 = O12 + O22,
  N = O11 + O12 + O21 + O22)
summary(KEY$C1) # should always be the same value (spoken sample size)
summary(KEY$C2) # should always be the same value (written sample size)
summary(KEY$N)  # should always be the same value ("total sample size")
# calculate expected frequencies
KEY <- transform(KEY,
  E11 = R1 * C1 / N, E12 = R1 * C2 / N,
  E21 = R2 * C1 / N, E22 = R2 * C2 / N)
all.equal(KEY$E11 + KEY$E12 + KEY$E21 + KEY$E22, KEY$N) # check consistency
# we don't really want to calculate keyness for the "OTHER" entry, so delete it
KEY <- subset(KEY, noun != "OTHER")
# calculate association scores: here we use (pointwise) MI, as some researchers
# have done for terminology extraction
KEY <- transform(KEY,
  MI = log2(O11 / E11))
# (equation from www.collocations.de/AM)
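# NB: MI is -Inf for any noun that never occurs in the spoken sample (O11 = 0);
# such nouns would all end up tied at the bottom of the ranking below. If any are
# present, it may be useful to inspect them separately -- a small sketch:
sum(KEY$O11 == 0)               # how many nouns are affected (if any)
head(subset(KEY, O11 == 0), 10) # written-only nouns, not meaningfully ranked by MI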
# now rank the data set by MI scores and list the entries with
# - large positive scores (spoken keywords, i.e. higher relative frequency in the first column)
# - large negative scores (written keywords, i.e. higher relative frequency in the second column)
idx.MI <- order(KEY$MI, decreasing=TRUE)
KEY.MI <- KEY[idx.MI, ]
head(KEY.MI[, c("noun", "written", "spoken", "O11", "E11", "MI")], 20)
tail(KEY.MI[, c("noun", "written", "spoken", "O11", "E11", "MI")], 20)
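# a further optional sketch (the threshold of 1 is arbitrary, for illustration only):
# MI >= 1 means the noun is at least twice as frequent in the spoken sample as
# expected, MI <= -1 at most half as frequent, giving simple keyword cut-offs
spoken.keywords <- subset(KEY.MI, MI >= 1)
written.keywords <- subset(KEY.MI, MI <= -1)
nrow(spoken.keywords)
nrow(written.keywords)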