Think in yasam: R tm

Notes for when using the tm package in R:

library(tm)

library(tm.plugin.mail)



# The mails live in ./corpus/SPAM, relative to the current working directory.
cname <- file.path(".", "corpus", "SPAM")

# Build the corpus with the readMail reader from "tm.plugin.mail"; per the
# original note it can remove some of the e-mail headers.
docs <- Corpus(
  x = DirSource(directory = cname),
  readerControl = list(reader = readMail)
)

# Alternative: read every mail as plain text instead.
# docs <- Corpus(DirSource(cname))

# Normalise the raw text of one mail.
#
# x: character vector holding the content of a single document.
# Returns a character vector of the same length with the encoding converted
# and the ad-hoc substitutions applied.
clean_mail_text <- function(x) {
  # Change encoding from "iso-8859-1" to "UTF-8".
  x <- iconv(x, "iso-8859-1", "UTF-8")

  # Turn "/", "@" and "|" into spaces so the words around them are split.
  x <- gsub("[/@|]", " ", x)

  # Replace the longer phrase first: it contains the shorter one, so doing
  # the shorter replacement first would leave nothing for this one to match.
  x <- gsub("other specific transform", "OST", x)
  gsub("specific transform", "ST", x)
}

# Apply the cleaner through content_transformer() so that every element of
# the corpus stays a tm text document (class and metadata intact) instead of
# being overwritten by a bare character vector.
docs <- tm_map(docs, content_transformer(clean_mail_text))

# Standard tm clean-up steps, applied to every document in the corpus.
#
# tolower() is a plain character function, so it is wrapped in
# content_transformer() to keep each element a tm text document. The former
# extra argument iconv(enc2utf8(docs), sub = "byte") was dropped: tm_map()
# forwards additional arguments to the mapped function and tolower() accepts
# only one, and enc2utf8() expects a character vector rather than a corpus.
docs <- tm_map(docs, content_transformer(tolower)) # lowercase
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
# NOTE(review): punctuation is stripped before stop-word removal, so stop
# words that contain an apostrophe may no longer match -- confirm that this
# ordering is intended.
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removeWords, c("own", "stop", "words", "oil"))
docs <- tm_map(docs, stripWhitespace)

docs <- tm_map(docs, stemDocument) # stemming

# Document-term matrix: documents as the rows, terms as the columns.
dtm <- DocumentTermMatrix(docs)
# Alternative with tf-idf weighting:
# dtm <- DocumentTermMatrix(docs, control = list(weighting = weightTfIdf))

Think in yasam

2015年1月14日星期三

R tm

沒有留言:

張貼留言

2015年1月14日 星期三

R tm

沒有留言:

張貼留言

2015年1月14日星期三