36-462/662
Lecture 3, 4 September 2019
source("http://www.stat.cmu.edu/~cshalizi/dm/19/hw/01/nytac-and-bow.R")
music.stories <- read.directory("nyt_corpus/music")
art.stories <- read.directory("nyt_corpus/art")
art.BoW.list <- lapply(art.stories, table)
music.BoW.list <- lapply(music.stories, table)
nyt.BoW.frame <- make.BoW.frame(c(art.BoW.list, music.BoW.list), row.names = c(paste("art",
1:length(art.BoW.list), sep = "."), paste("music", 1:length(music.BoW.list),
sep = ".")))
dim(nyt.BoW.frame)
## [1] 102 4431
query.by.similarity <- function(query, BoW.frame) {
query.vec = strip.text(query)
query.BoW = table(query.vec)
lexicon = colnames(BoW.frame)
query.vocab = names(query.BoW)
query.lex = query.BoW[intersect(query.vocab, lexicon)]
query.lex[setdiff(lexicon, query.vocab)] = 0
query.lex = query.lex[lexicon]
q = t(as.matrix(query.lex))
idf = get.idf.weights(BoW.frame)
BoW = scale.cols(BoW.frame, idf)
q = q * idf
BoW = div.by.euc.length(BoW)
q = q/sqrt(sum(q^2))
best.index = nearest.points(q, BoW)$which
best.name = rownames(BoW)[best.index]
return(list(best.index = best.index, best.name = best.name))
}
get.idf.weights <- function(x) {
doc.freq <- colSums(x > 0)
doc.freq[doc.freq == 0] <- 1
w <- log(nrow(x)/doc.freq)
return(w)
}
## $best.index
## [1] 96
##
## $best.name
## [1] "music.39"
## [1] "perched" "five" "stories" "above" "columbus"
## [6] "circle" "in" "the" "time" "warner"
## [11] "center" "rafael" "vi" "olys" "new"
## [16] "design" "for" "jazz" "at" "lincoln"
## [21] "center" "has" "a" "cool" "ethereality"
## $best.index
## [1] 30
##
## $best.name
## [1] "art.30"
## [1] "xl" "xavier" "laboulbenne" "gallery" "#"
## [6] "west" "#nd" "street" "chelsea" "through"
## [11] "feb" "#popular" "culture" "may" "be"
## [16] "the" "mainspring" "for" "a" "lot"
## [21] "of" "new" "art" "but" "it"
# Which story is in which class?
story.classes <- c(rep("art", times = length(art.stories)), rep("music", times = length(music.stories)))
nyt.similarities <- distances(div.by.euc.length(idf.weight(nyt.BoW.frame)))
NNs <- nearest.points(nyt.BoW.frame, d = nyt.similarities)$which
NN.classes <- story.classes[NNs]
# Average error rate
mean(NN.classes != story.classes)
## [1] 0.1862745
## NN.classes
## story.classes art music
## art 45 12
## music 7 38
Say “art” is positive and “music” is negative
What’s the false positive rate, i.e., the probability that a story about music will be falsely classified as about art?
What’s the false negative rate?
What’s the positive predictive value, i.e., the probability that a story classified as “art” is actually about art?
## [1] 102 4431
prototypical.art <- colMeans(nyt.BoW.normed.idf[story.classes == "art", ])
prototypical.music <- colMeans(nyt.BoW.normed.idf[story.classes == "music",
])
prototypes <- rbind(prototypical.art, prototypical.music)
prototype.matches <- nearest.points(nyt.BoW.normed.idf, prototypes)$which
prototype.classes <- c("art", "music")[prototype.matches]
mean(prototype.classes != story.classes)
## [1] 0
## prototype.classes
## story.classes art music
## art 57 0
## music 0 45
(Don’t expect the prototype method to always out-perform nearest neighbors!)
## [1] 102 4431
\(n\) data points, 2 classes
Mitchell, Melanie. 1996. An Introduction to Genetic Algorithms. Cambridge, Massachusetts: MIT Press.
Shalizi, Cosma Rohilla. 2009. “Dynamics of Bayesian Updating with Dependent Data and Misspecified Models.” Electronic Journal of Statistics 3:1039–74. https://doi.org/10.1214/09-EJS485.
Sperber, Dan, and Deirdre Wilson. 1995. Relevance: Cognition and Communication. Second. Oxford: Basil Blackwell.
Sutton, Richard S., and Andrew G. Barto. 1998. Reinforcement Learning: An Introduction. Cambridge, Massachusetts: MIT Press. http://www.cs.ualberta.ca/~sutton/book/the-book.html.