nltk 获取 gutenberg 语料
gensim 生成词库和 onehot 编码
更多教程:http://www.tensorflownews.com/
gensim 生成词库和 onehot 编码
我正在尝试基于 TensorFlow 的 LSTM 模型开发另外一个项目,需要自然语言处理的工具和语料。
# NOTE: nltk, models and similarities are not referenced in this listing;
# the imports are kept because other code may rely on them.
import nltk
import numpy as np
from nltk.corpus import gutenberg
from gensim import corpora, models, similarities


class Book2Array():
    """Turn tokenized sentences into fixed-size one-hot arrays.

    A gensim ``Dictionary`` built from the sentences provides the
    token -> integer id mapping. Every sentence is then encoded as a
    ``(max_len, vocab_size)`` array with one one-hot row per token; longer
    sentences are truncated and shorter ones are padded with all-zero rows.

    Args:
        sentences: Iterable of sentences, each an iterable of string tokens.
        vocab_size: Width of every one-hot vector. Must be larger than the
            biggest token id, otherwise encoding raises ``IndexError``.
        max_len: Number of token rows kept per sentence.
    """

    sentences = None
    token2id_dic = None
    # Class-level defaults keep instances that were created without
    # __init__ (or by older callers) working with the original sizes.
    vocab_size = 8192
    max_len = 20

    def __init__(self, sentences, vocab_size=8192, max_len=20):
        self.sentences = sentences
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.token2id_dic = self.get_token2id_dic()

    def get_sentences(self):
        """Print the number of sentences and return them as a list."""
        # Use the instance's own corpus; the previous code read a
        # module-level variable that only exists when run as a script.
        print(len(self.sentences))
        return list(self.sentences)

    def get_token2id_dic(self):
        """Build the token -> id mapping from ``self.sentences``.

        Prints the vocabulary size as a side effect.

        Returns:
            The ``token2id`` dict of the gensim ``Dictionary``.
        """
        # Collect statistics about all tokens. No stop-word or
        # rare-word filtering is applied here.
        dictionary = corpora.Dictionary(self.sentences)
        # Reassign ids so that the id sequence has no gaps.
        dictionary.compactify()
        print(len(dictionary))
        return dictionary.token2id

    def word2onehot(self, word):
        """Return the one-hot vector of length ``vocab_size`` for *word*.

        Raises:
            KeyError: If *word* is not in the dictionary.
            IndexError: If the id of *word* is not below ``vocab_size``.
        """
        onehot = np.zeros(self.vocab_size)
        onehot[self.token2id_dic[word]] = 1
        return onehot

    def sent2vec(self, sentence):
        """Encode one sentence as a ``(max_len, vocab_size)`` array.

        Only the first ``max_len`` tokens are used; rows beyond the end of
        a shorter sentence stay all-zero (padding).

        Raises:
            KeyError: If a token is not in the dictionary.
            IndexError: If a token id is not below ``vocab_size``.
        """
        # Allocate the padded matrix once instead of stacking per-word
        # vectors; zip() stops after max_len tokens (truncation).
        vec_np = np.zeros((self.max_len, self.vocab_size))
        for row, word in zip(range(self.max_len), sentence):
            vec_np[row, self.token2id_dic[word]] = 1
        return vec_np

    def sentences2array(self):
        """Return a list with one encoded array per sentence.

        Each array holds ``max_len * vocab_size`` floats, so the result
        can be very large for a big corpus.
        """
        return [self.sent2vec(sentence) for sentence in self.sentences]

    def gen_batch(self):
        """Placeholder for batch generation; not implemented yet."""
        pass


if __name__ == '__main__':
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    book_array = Book2Array(macbeth_sentences)
    book_array.get_sentences()
    array = book_array.sentences2array()
    np_array = np.array(array[0])
    print(np_array.shape)

# More tutorials: http://www.tensorflownews.com/
继续阅读与本文标签相同的文章
上一篇 :
nltk 中的 sents 和 words
下一篇 :
tensorflow rnn 最简单实现代码
-
移动深度学习 Mobile-deep-learning(MDL)
2026-05-26栏目: 教程
-
Serpent.AI - 游戏代理框架(Python)
2026-05-26栏目: 教程
-
face-alignment:用 pytorch 实现的 2D 和 3D 人脸对齐库
2026-05-26栏目: 教程
-
ZhuSuan 是建立在 TensorFlow 上的贝叶斯深度学习 Python 库
2026-05-26栏目: 教程
-
pix2code:从截图生成图形用户界面代码
2026-05-26栏目: 教程
