## Perform an MDS analysis of the Italian NN compound data, based on
## (scaled versions of) the sets of cues we described in class.

## Load the compound statistics (whitespace-separated, first line holds
## the column names) and standardise columns 4 to 9, the cue columns:
## scale() centres each column and divides it by its standard deviation.

d <- read.table(file = "comp.stats.txt", header = TRUE)
scaled <- scale(d[, 4:9], center = TRUE, scale = TRUE)


## 1. MDS operates on a distance matrix, a symmetric matrix of 
## distances between each point in the data-set and each other 
## point. Thus, the first thing you will need to do is to generate
## a distance matrix from the cue matrix. Look at the documentation
## for the dist() function, and use it to generate distance matrices
## using two different methods to compute distance. 

## Euclidean distance between the rows of the scaled cue matrix
## (this is the method dist() uses when none is given):

euc.mat <- dist(scaled, method = "euclidean")

## a second measure for comparison: maximum distance, i.e. the largest
## absolute difference between two rows on any single cue:

max.mat <- dist(scaled, method = "maximum")


## 2. In order to perform MDS, you will use the cmdscale() function:
## take a look at its documentation, and run MDS on each of your 
## distance matrices

## classical MDS on each distance matrix; k = 2 is the cmdscale()
## default, written out to make the number of dimensions explicit
euc.mds <- cmdscale(euc.mat, k = 2)
max.mds <- cmdscale(max.mat, k = 2)

## each result is therefore a two-column matrix of coordinates:

summary(euc.mds)
summary(max.mds)


## 3. Plot the compounds in the first two dimensions produced by 
## the MDS analyses, using different colours for relational and 
## attributive compounds. 

## a compact way to do it: turn each TYPE label into an integer colour
## code. TYPE goes through as.factor() first because read.table()
## returns character columns by default since R 4.0, and as.numeric()
## on character labels yields NA (with a coercion warning), so the
## points would be drawn with col = NA, i.e. not visibly. If TYPE is
## already a factor, as.factor() leaves it unchanged.

plot(euc.mds, col = as.integer(as.factor(d$TYPE)))
plot(max.mds, col = as.integer(as.factor(d$TYPE)))

## alternatively, draw an empty frame first and then add each class
## with its own colour. drop = FALSE keeps the selection a two-column
## matrix even when a class has a single member; without it that row
## would collapse to a plain vector and points() would plot its two
## values against their index instead of as one (x, y) point.

plot(euc.mds, type = "n")
points(euc.mds[d$TYPE == "at", , drop = FALSE], col = "black")
points(euc.mds[d$TYPE == "re", , drop = FALSE], col = "red")

plot(max.mds, type = "n")
points(max.mds[d$TYPE == "at", , drop = FALSE], col = "black")
points(max.mds[d$TYPE == "re", , drop = FALSE], col = "red")


## 4. Try k-means clustering on the MDS outputs, and look at 
## performance by cross-tabulating the clusters and the 
## relational/attributive labels.

## two-cluster k-means on the two MDS dimensions of each solution,
## each run using 10 random starts (nstart = 10)

euc.km <- kmeans(euc.mds, centers = 2, nstart = 10)
max.km <- kmeans(max.mds, centers = 2, nstart = 10)

## Cross-tabulate cluster membership (rows) against the
## relational/attributive label (columns). Original author's reading
## of the output: the Euclidean solution is comparable to the one
## obtained with PCA, whereas the maximum-distance one is completely off.

table(euc.km[["cluster"]], d$TYPE)
table(max.km[["cluster"]], d$TYPE)