2015年1月14日 星期三

R tm

Notes on preprocessing e-mail text when using the tm package in R.



library(tm)
library(tm.plugin.mail)

# The messages are expected under ./corpus/SPAM, relative to the current
# working directory.
cname <- file.path(".", "corpus", "SPAM")

# Build the corpus from every file in that directory. readMail (from
# tm.plugin.mail) is used as the reader; it can strip some of the e-mail
# headers while reading.
docs <- Corpus(
  x = DirSource(directory = cname),
  readerControl = list(reader = readMail)
)

# Alternative: read each message as plain text instead.
# docs <- Corpus(DirSource(cname))

# Per-document text normalisation.
#
# x: character vector holding the text of one document.
# Returns a character vector of the same length with the encoding converted
# and the substitutions below applied.
normalize_mail_text <- function(x) {
  # Change encoding from "iso-8859-1" to "UTF-8".
  x <- iconv(x, from = "iso-8859-1", to = "UTF-8")

  # Turn "/", "@" and "|" into spaces so the words they join are separated.
  x <- gsub("[/@|]", " ", x)

  # Project-specific replacements. The longer phrase must be handled first:
  # "specific transform" is a substring of "other specific transform", so the
  # opposite order would rewrite it to "other ST" and the second rule would
  # never match.
  x <- gsub("other specific transform", "OST", x)
  x <- gsub("specific transform", "ST", x)
  x
}

# Apply the normalisation through tm_map() + content_transformer() rather than
# assigning into docs[[j]]: this rewrites only the text content, so every
# element stays a tm document object (class and metadata intact) and the later
# tm_map() / DocumentTermMatrix() steps keep working.
docs <- tm_map(docs, content_transformer(normalize_mail_text))

# Lower-case helper for tm_map().
#
# x: character vector holding the text of one document.
# Bytes that are not valid UTF-8 are replaced by their "<xx>" hex form first,
# because tolower() stops with "invalid multibyte string" on such input.
lowercase_utf8 <- function(x) {
  tolower(iconv(enc2utf8(x), from = "UTF-8", to = "UTF-8", sub = "byte"))
}

# Cleaning pipeline. tolower() is a plain character function, so it has to be
# wrapped in content_transformer(); the other steps are tm transformations and
# can be passed to tm_map() directly.
docs <- tm_map(docs, content_transformer(lowercase_utf8))  # lowercase
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removeWords, c("own", "stop", "words", "oil"))
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, stemDocument)  # stemming

# Document-term matrix: documents as the rows, terms as the columns.
dtm <- DocumentTermMatrix(docs)
# TF-IDF weighted alternative:
# dtm <- DocumentTermMatrix(docs, control = list(weighting = weightTfIdf))

沒有留言:

張貼留言