library(tm)
library(tm.plugin.mail)
# Expected layout: a 'corpus' folder in the current working directory that
# contains a 'SPAM' folder holding the e-mail files.
cname <- file.path(".", "corpus", "SPAM")
# Read every file in the folder with readMail (from tm.plugin.mail); per the
# original note, this reader can strip some of the e-mail headers.
# (This assignment had been fused into the comment line above it, so `docs`
# was never created.)
docs <- Corpus(DirSource(cname), readerControl = list(reader = readMail))
# Alternative: read each file as plain text instead of as mail.
# docs <- Corpus(DirSource(cname))
# Per-document text clean-up.
# The transformations are wrapped in content_transformer() and applied with
# tm_map() so that only the text content of each document is rewritten and
# every element of `docs` stays a tm document object. Assigning the bare
# result of iconv()/gsub() to docs[[j]] would replace the document with a
# plain character vector.

# Re-encode the text from "iso-8859-1" to "UTF-8".
to_utf8 <- content_transformer(
  function(x) iconv(x, "iso-8859-1", "UTF-8")
)
# Replace every match of the regular expression `pattern` by `replacement`.
replace_text <- content_transformer(
  function(x, pattern, replacement) gsub(pattern, replacement, x)
)

docs <- tm_map(docs, to_utf8)
# Turn separator characters into spaces.
docs <- tm_map(docs, replace_text, "/", " ")
docs <- tm_map(docs, replace_text, "@", " ")
docs <- tm_map(docs, replace_text, "\\|", " ")
# The longer phrase must be replaced first: it contains the shorter one, so
# running the shorter replacement first would leave nothing for it to match.
docs <- tm_map(docs, replace_text, "other specific transform", "OST")
docs <- tm_map(docs, replace_text, "specific transform", "ST")
# Standard normalisation steps, applied to the whole corpus in order.

# Lower-case the text. Bytes that cannot be converted are first substituted
# with their escaped byte value (iconv(..., sub = "byte")); this appears to be
# what the original extra argument to tolower was meant to achieve.
# content_transformer() is required so the result stays a tm document.
to_lower_safe <- content_transformer(
  function(x) tolower(iconv(enc2utf8(x), sub = "byte"))
)
docs <- tm_map(docs, to_lower_safe)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
# Drop the built-in English stop words, then the project-specific ones.
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removeWords, c("own", "stop", "words", "oil"))
# Collapse the runs of whitespace left behind by the removals above.
docs <- tm_map(docs, stripWhitespace)
# Stemming.
docs <- tm_map(docs, stemDocument)
# Build the document-term matrix from the cleaned corpus:
# documents are the rows, terms are the columns.
dtm <- DocumentTermMatrix(x = docs)
# TF-IDF weighted variant, kept for reference:
# dtm <- DocumentTermMatrix(docs, control = list(weighting = weightTfIdf))
# (Blog page footer, not part of the script) No comments:
# Post a comment