# Our solution to HW1
# TIMTOWTDI ;-)

# read the data and attach (this changes from computer
# to computer, of course)
setwd("/Users/baroni/Desktop/potsdam/lectures/data")
# The answers below rely on the categorical columns being factors
# (levels(), per-level counts in summary()). Since R 4.0.0 read.delim()
# keeps strings as character vectors by default, so ask for factors
# explicitly; on older versions of R this is simply the default.
BNC <- read.delim("bnc_metadata.tbl", stringsAsFactors = TRUE)
# attach() makes the columns of BNC visible as plain variables, which is
# what every expression below relies on
attach(BNC)

# question 1
dim(BNC)
# how many metadata? it depends on how you define metadata,
# but if you consider each column, id's included, to be
# metadata, then the number of metadata is the same as the
# number of columns, i.e., 31

# question 2
# how many genres
nlevels(genre)
## you could also use length(levels(genre)), length(summary(genre))
## or length(table(genre))
# the smallest ones
head(sort(table(genre)))

# question 3
## on a German Windows PC, you have to use write.csv2(), because
## Excel expects a different CSV format there; otherwise use write.csv()
write.csv2(BNC, "bnc.csv", row.names = FALSE)

# question 4
# summarise the distribution of text lengths
summary(n_w)
summary(n_s)
# outliers in written-to-be-spoken
boxplot(n_words[text_type == "written-to-be-spoken"])
boxplot(n_words[text_type == "written-to-be-spoken"])$out
# can you list the titles of the outlier texts?
## threshold of 20000 obtained visually from box plot: the outliers are
## the texts *above* the threshold (as.character() keeps the output
## readable by not printing the full set of factor levels)
as.character(title[text_type == "written-to-be-spoken" & n_words > 20000])

# question 5
# do text lengths differ between text types?
boxplot(n_words ~ text_type, ylim = c(0, 100000))
# do they differ between male and female authors?
boxplot(n_words ~ author_sex, ylim = c(0, 1e+05))
# or
boxplot(n_words[author_sex == "male"], n_words[author_sex == "female"],
        names = c("male", "female"))

# question 6
# produce a subset of the metadata table containing only texts
# for which author sex is known...
temp <- subset(BNC, author_sex %in% c("male", "female"))
# ... omitting the title and irrelevant metadata columns (esp.
# those which have only a single value in the subset)...
# let's check which columns have this property:
summary(temp)
# they appear to be:
# context interaction_type mode region respondent_age respondent_class respondent_sex
# subsetting:
male.female <- subset(temp,
                      select = -c(title, context, interaction_type, mode,
                                  region, respondent_age, respondent_class,
                                  respondent_sex))

# question 7
plot(n_words, n_s, xlim = c(0, 100000), ylim = c(0, 10000))
cor.test(n_words, n_s)
# corr coeff: 0.8687635
# conf intervals: 0.8610003; 0.8761220
# relation does not appear to be linear: plot strongly
# suggests that different sets of documents follow
# different patterns, some linear, some not