使用 NLTK 获取 Gutenberg 语料
使用 gensim 生成词库和 one-hot 编码

我正在尝试基于 TensorFlow 的 LSTM 模型开发另外一个项目,需要用到自然语言处理的工具和语料。


import nltk
import numpy as np
from nltk.corpus import gutenberg
from gensim import corpora, models, similarities


class Book2Array:
    """Convert tokenized sentences into fixed-size one-hot encoded arrays.

    A gensim ``Dictionary`` is built from the sentences to map every token to
    an integer id; each word is then encoded as a one-hot vector of length
    ``vocab_size`` and each sentence as a ``(max_len, vocab_size)`` array.

    Args:
        sentences: Iterable of sentences, each sentence a sequence of token
            strings (for example the result of ``gutenberg.sents(...)``).
        vocab_size: Length of every one-hot vector. Token ids produced by the
            dictionary must be smaller than this value.
        max_len: Number of word rows per sentence array; longer sentences are
            truncated and shorter ones are padded with all-zero rows.
    """

    # Class-level defaults; ``__init__`` overrides them per instance.
    sentences = None
    token2id_dic = None
    vocab_size = 8192
    max_len = 20

    def __init__(self, sentences, vocab_size=8192, max_len=20):
        self.sentences = sentences
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.token2id_dic = self.get_token2id_dic()

    def get_sentences(self):
        """Print the number of sentences and return them as a plain list."""
        # Use the instance's own sentences rather than a module-level global,
        # so the method also works when the class is imported elsewhere.
        print(len(self.sentences))
        return list(self.sentences)

    def get_token2id_dic(self):
        """Build the token -> integer id mapping from ``self.sentences``.

        Prints the vocabulary size as a side effect.

        Returns:
            The ``token2id`` dict of the gensim ``Dictionary``.
        """
        # Collect statistics about all tokens.
        dictionary = corpora.Dictionary(self.sentences)
        # No tokens are filtered out here, so this should leave the ids
        # unchanged; it is kept so ids are contiguous if filtering is added.
        dictionary.compactify()
        print(len(dictionary))
        return dictionary.token2id

    def word2onehot(self, word):
        """Return the one-hot vector (length ``vocab_size``) for ``word``.

        Raises:
            KeyError: If ``word`` is not in the dictionary.
            IndexError: If the word's id is not smaller than ``vocab_size``.
        """
        onehot = np.zeros(self.vocab_size)
        onehot[self.token2id_dic[word]] = 1
        return onehot

    def sent2vec(self, sentence):
        """Encode one sentence as a ``(max_len, vocab_size)`` array.

        Only the first ``max_len`` words are encoded; rows beyond the end of
        a shorter sentence stay all-zero and act as padding.

        Raises:
            KeyError: If a word is not in the dictionary.
            IndexError: If a word's id is not smaller than ``vocab_size``.
        """
        # Allocate the padded result once instead of appending zero vectors.
        encoded = np.zeros((self.max_len, self.vocab_size))
        for row, word in enumerate(sentence[:self.max_len]):
            encoded[row] = self.word2onehot(word)
        return encoded

    def sentences2array(self):
        """Return a list holding the ``sent2vec`` array of every sentence.

        NOTE: every sentence costs ``max_len * vocab_size`` float64 values
        (about 1.25 MiB with the defaults), so a whole book needs a lot of
        memory.
        """
        return [self.sent2vec(sentence) for sentence in self.sentences]

    def gen_batch(self):
        """Placeholder for batch generation; not implemented yet."""
        pass


def main():
    """Encode the sentences of Macbeth and print the first array's shape."""
    # NOTE(review): presumably the NLTK 'gutenberg' corpus data must already
    # be available locally (e.g. via nltk.download) - confirm before running.
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    book_array = Book2Array(macbeth_sentences)
    book_array.get_sentences()
    array = book_array.sentences2array()
    np_array = np.array(array[0])
    print(np_array.shape)


if __name__ == '__main__':
    main()

更多教程:http://www.tensorflownews.com/
收藏 打印