# Collect every file under ./data (sorted, so the row order is reproducible),
# parse each one as CSV and stack the results into a single DataFrame.
rspList = sorted(glob.glob('./data/*'))
df = pd.concat([pd.read_csv(path) for path in rspList])
# Hand-picked English stop words that are removed from every response.
stoplist = set('i am you are he she is a for of the and to in'.split())

# Rows whose feedback is the literal string '\N' carry no response; skip them.
feedback = df['translated_feedback']
sents = feedback[feedback != '\\N']

# Tokenise each response: apply trans_table (defined outside this excerpt;
# presumably it strips punctuation -- confirm), lower-case, split on
# whitespace and drop stop words.
texts = []
for sent in sents.values:
    words = sent.translate(trans_table).lower().split()
    kept = [word for word in words if word not in stoplist]
    if kept:  # responses left with no tokens are discarded
        texts.append(kept)
#print(texts)
# Count how many times each token occurs across all documents.
feq = defaultdict(int)
for word in (w for doc in texts for w in doc):
    feq[word] += 1

# Keep only tokens that occur more than once overall. A document that loses
# all of its tokens here stays in the list as an empty list.
texts = [[word for word in doc if feq[word] > 1] for doc in texts]
# Build the token dictionary from the filtered documents and save it to disk.
dic = corpora.Dictionary(texts)
dic.save('./dictionary.dict')
# print(dic)

# Turn every document into its bag-of-words form, then serialise the whole
# corpus with MmCorpus to ./corpus.mm.
corpus = list(map(dic.doc2bow, texts))
corpora.MmCorpus.serialize('./corpus.mm', corpus)
#print(corpus)
When I try to get id2token from dic, it is an empty dictionary {}:
dic.id2token
{}
I have to iterate over dic once first:
# id2token is still empty right after the dictionary is built; one full pass
# over items() is enough to fill it. The loop body has nothing to do.
for _ in dic.items():
    pass
dic.id2token
{0:'yes',1:'got',2:'it'}
沒有留言:
張貼留言