本实践采用的是IWSLT TED演讲en-zh数据集,是基于《TensorFlow:实战Google深度学习框架(第二版)》一书所做的实验,大部分代码直接用的是书上的,为了更好地巩固知识,所以整理成博客。
首先从相关网站上下载数据集,解压,代码如下所示:
wget https://wit3.fbk.eu/archive/2015-01//texts/en/zh/en-zh.tgz
tar xzvf en-zh.tgz
cd en-zh/
我们只关注train.tags.en-zh.en和train.tags.en-zh.zh两个文本数据文件,但它们都带有html标记,需要先进行预处理。解压后目录中的文件如下:
IWSLT15.TED.dev2010.en-zh.en. IWSLT15.TED.tst2011.en-zh.en. IWSLT15.TED.tst2013.en-zh.en. train.tags.en-zh.zh
IWSLT15.TED.dev2010.en-zh.zh. IWSLT15.TED.tst2011.en-zh.zh. IWSLT15.TED.tst2013.en-zh.zh. train.zh
IWSLT15.TED.tst2010.en-zh.en. IWSLT15.TED.tst2012.en-zh.en. README
IWSLT15.TED.tst2010.en-zh.zh. IWSLT15.TED.tst2012.en-zh.zh. train.tags.en-zh.en
在对原始数据进行处理时,需要进行分词、建立词库和数字化(将词转换成编号)等操作,在将数据处理成可以输入模型的形式时还得进行padding。在这里,本人选取的中文和英文的分词工具都是stanfordcorenlp,相关知识请参考这篇博客。下面直接上代码:
#coding:utf-8
import collections
from operator import itemgetter
from stanfordcorenlp import StanfordCoreNLP
import tqdm
# Step 1: put the English and Chinese corpora into one-sentence-per-line form.
def deletehtml(filename1, filename2):
    """Write copies of two parallel text files with markup lines removed.

    The two files are walked in lockstep. A pair of lines is kept only when
    both lines are non-empty after stripping and neither contains a ``<`` or
    ``>`` character. Kept lines are written, stripped, to
    ``<filename>.deletehtml`` so the two outputs stay line-aligned.

    Args:
        filename1: Path of the first file of the parallel pair.
        filename2: Path of the second file of the parallel pair.

    Returns:
        A ``(path1, path2)`` tuple holding the two output file paths.

    Raises:
        ValueError: If the two input files have different line counts.
    """
    out1 = filename1 + ".deletehtml"
    out2 = filename2 + ".deletehtml"

    # NOTE(review): assumes the corpus files are UTF-8; the encoding is named
    # explicitly so the result does not depend on the platform locale.
    with open(filename1, "r", encoding="utf-8") as f1, \
            open(filename2, "r", encoding="utf-8") as f2:
        data1 = f1.readlines()
        data2 = f2.readlines()

    # Explicit check instead of ``assert`` so it is not stripped under -O.
    if len(data1) != len(data2):
        raise ValueError(
            "parallel files differ in length: %d vs %d lines"
            % (len(data1), len(data2)))

    print("deletehtml...")
    with open(out1, "w", encoding="utf-8") as fw1, \
            open(out2, "w", encoding="utf-8") as fw2:
        # zip() has no len(), so pass the total for a meaningful progress bar.
        for line1, line2 in tqdm.tqdm(zip(data1, data2), total=len(data1)):
            line1 = line1.strip()
            line2 = line2.strip()
            if not (line1 and line2):
                continue
            # Drop the pair if either side carries any markup character.
            if any(ch in text for text in (line1, line2) for ch in "<>"):
                continue
            fw1.write(line1 + "\n")
            fw2.write(line2 + "\n")
    return out1, out2
# Step 2: tokenize the text and build the vocabulary.
def segement_sentence(filename, vocab_size, lang='en'):
    """Tokenize a one-sentence-per-line file and write its vocabulary.

    Every line is tokenized with StanfordCoreNLP and written, space-joined,
    to ``<filename>.segment``. Token frequencies are counted and the
    vocabulary (``<unk>``, ``<sos>``, ``<eos>`` followed by tokens in
    descending frequency order, truncated to ``vocab_size`` entries) is
    written one entry per line to ``<filename>.vocab``.

    Args:
        filename: Path of the input text file.
        vocab_size: Maximum number of vocabulary entries, including the
            three special tokens.
        lang: Language code handed to ``StanfordCoreNLP``.

    Returns:
        The path of the tokenized file, ``<filename>.segment``.
    """
    segment_path = filename + ".segment"

    # Read the input before creating the tokenizer so a missing file does not
    # leave a tokenizer instance behind.
    # NOTE(review): assumes the corpus files are UTF-8.
    with open(filename, "r", encoding="utf-8") as f:
        data = f.readlines()

    counter = collections.Counter()
    nlp = StanfordCoreNLP("../stanford-corenlp-full-2018-10-05", lang=lang)
    try:
        with open(segment_path, "w", encoding="utf-8") as f1:
            print("segmenting...")
            for line in tqdm.tqdm(data):
                word_list = nlp.word_tokenize(line.strip())
                f1.write(" ".join(word_list) + "\n")
                counter.update(word_list)
    finally:
        # Close the tokenizer even when tokenization or writing fails.
        nlp.close()

    sorted_word_to_cnt = sorted(
        counter.items(), key=itemgetter(1), reverse=True)
    # Special tokens come first, so their ids are 0, 1 and 2.
    sorted_words = ["<unk>", "<sos>", "<eos>"]
    sorted_words += [word for word, _ in sorted_word_to_cnt]
    sorted_words = sorted_words[:vocab_size]

    with open(filename + ".vocab", "w", encoding="utf-8") as fw:
        fw.writelines(word + "\n" for word in sorted_words)
    return segment_path
# Step 3: convert the tokenized text into integer ids.
def convert_to_id(filename, vocab_file):
    """Map every token of a tokenized file to its vocabulary index.

    The vocabulary file holds one entry per line; an entry's id is its
    zero-based line number. Each input line is split on whitespace,
    ``<eos>`` is appended, and the space-joined ids are written to
    ``<filename>.id``. Tokens missing from the vocabulary get the id of
    ``<unk>``.

    Args:
        filename: Path of the whitespace-tokenized input file.
        vocab_file: Path of the vocabulary file.

    Returns:
        The path of the id file, ``<filename>.id``.

    Raises:
        KeyError: If the vocabulary has no ``<unk>`` entry.
    """
    # NOTE(review): assumes the vocabulary and corpus files are UTF-8.
    with open(vocab_file, "r", encoding="utf-8") as f:
        word_to_id = {line.strip(): idx for idx, line in enumerate(f)}
    unk_id = word_to_id["<unk>"]

    with open(filename, "r", encoding="utf-8") as f:
        data = f.readlines()

    id_path = filename + ".id"
    print("converting...")
    with open(id_path, "w", encoding="utf-8") as f1:
        for line in tqdm.tqdm(data):
            words = line.strip().split() + ["<eos>"]
            ids = " ".join(str(word_to_id.get(word, unk_id)) for word in words)
            f1.write(ids + "\n")
    return id_path
def main():
    """Run the preprocessing pipeline on the en-zh training files.

    The steps are chained through their return values: markup removal,
    tokenization plus vocabulary building, then conversion to ids.
    """
    src = "train.tags.en-zh.en"  # still contains HTML-style markup
    trg = "train.tags.en-zh.zh"  # same
    src_vocab_size = 10000
    trg_vocab_size = 4000

    src_clean, trg_clean = deletehtml(src, trg)
    src_segmented = segement_sentence(src_clean, src_vocab_size, lang='en')
    trg_segmented = segement_sentence(trg_clean, trg_vocab_size, lang='zh')
    # segement_sentence writes the vocabulary next to its input file.
    convert_to_id(src_segmented, src_clean + ".vocab")
    convert_to_id(trg_segmented, trg_clean + ".vocab")


if __name__ == '__main__':
    main()
使用上述代码对数据文件进行处理后,我们就得到了每行一句话、每句话都已转成编号的数据,并且每个句子的末尾都加上了<eos>标志。①但是由于每句话的长短不一,需要将一个batch中较短的句子padding至该batch中最长句子的长度,这样才能使encoder正常工作;②在decoder阶段,由于输入需要以<sos>开头,还得对目标语言再处理一下,把x y z <eos>变为<sos> x y z作为decoder的输入。直接上完整的训练代码:
#coding:utf-8
import tensorflow as tf
MAX_LEN = 50  # upper bound on sentence length (in tokens) kept for training
SOS_ID = 1  # id of "<sos>": second entry of the generated vocabulary
SRC_TRAIN_DATA = "../train.tags.en-zh.en.deletehtml.segment.id"
TRG_TRAIN_DATA = "../train.tags.en-zh.zh.deletehtml.segment.id"
CHECKPOINT_PATH = "./seq2seq_ckpt"
HIDDEN_SIZE = 1024  # LSTM width; also used as the embedding size
NUM_LAYERS = 2  # number of stacked LSTM layers in encoder and decoder
SRC_VOCAB_SIZE = 10000
TRG_VOCAB_SIZE = 4000
BATCH_SIZE = 100
NUM_EPOCH = 5
KEEP_PROB = 0.8  # keep probability handed to tf.nn.dropout
MAX_GRAD_NORM = 5  # global-norm threshold for gradient clipping
SHARE_EMB_AND_SOFTMAX = True  # reuse the target embedding as softmax weight


class NMTModel(object):
    """Encoder-decoder translation model built from stacked LSTM cells."""

    def __init__(self):
        """Create the LSTM cells, embeddings and softmax parameters."""
        self.enc_cell = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)
             for _ in range(NUM_LAYERS)])
        self.dec_cell = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)
             for _ in range(NUM_LAYERS)])

        self.src_embedding = tf.get_variable(
            "src_emb", [SRC_VOCAB_SIZE, HIDDEN_SIZE])
        self.trg_embedding = tf.get_variable(
            "trg_emb", [TRG_VOCAB_SIZE, HIDDEN_SIZE])

        if SHARE_EMB_AND_SOFTMAX:
            # The output projection is the transposed target embedding.
            self.softmax_weight = tf.transpose(self.trg_embedding)
        else:
            self.softmax_weight = tf.get_variable(
                "weight", [HIDDEN_SIZE, TRG_VOCAB_SIZE])
        self.softmax_bias = tf.get_variable("softmax_bias", [TRG_VOCAB_SIZE])

    def forward(self, src_input, src_size, trg_input, trg_label, trg_size):
        """Build the training graph for one batch.

        Args:
            src_input: Source token ids; the first axis is the batch axis.
            src_size: Source sentence lengths, passed to the encoder RNN.
            trg_input: Decoder input token ids.
            trg_label: Target token ids the decoder should predict; its
                second axis is used as the mask length.
            trg_size: Target sentence lengths, used for the decoder RNN and
                for masking padded positions out of the loss.

        Returns:
            A ``(cost_per_token, train_op)`` tuple: the masked average
            cross-entropy per target token and the op that applies one
            gradient-descent step.
        """
        batch_size = tf.shape(src_input)[0]

        src_emb = tf.nn.embedding_lookup(self.src_embedding, src_input)
        trg_emb = tf.nn.embedding_lookup(self.trg_embedding, trg_input)
        src_emb = tf.nn.dropout(src_emb, KEEP_PROB)
        trg_emb = tf.nn.dropout(trg_emb, KEEP_PROB)

        # Only the final encoder state is used; it seeds the decoder.
        with tf.variable_scope("encoder"):
            _, enc_state = tf.nn.dynamic_rnn(
                self.enc_cell, src_emb, src_size, dtype=tf.float32)
        with tf.variable_scope("decoder"):
            dec_outputs, _ = tf.nn.dynamic_rnn(
                self.dec_cell, trg_emb, trg_size, initial_state=enc_state)

        # Flatten all decoder steps into rows before the output projection.
        output = tf.reshape(dec_outputs, [-1, HIDDEN_SIZE])
        logits = tf.matmul(output, self.softmax_weight) + self.softmax_bias
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.reshape(trg_label, [-1]), logits=logits)

        # Zero weight on positions past each sentence's length.
        label_weights = tf.sequence_mask(
            trg_size, maxlen=tf.shape(trg_label)[1], dtype=tf.float32)
        label_weights = tf.reshape(label_weights, [-1])
        cost = tf.reduce_sum(loss * label_weights)
        cost_per_token = cost / tf.reduce_sum(label_weights)

        # Gradients are taken of the per-sentence cost, then clipped.
        trainable_variables = tf.trainable_variables()
        grads = tf.gradients(
            cost / tf.to_float(batch_size), trainable_variables)
        grads, _ = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
        train_op = optimizer.apply_gradients(zip(grads, trainable_variables))
        return cost_per_token, train_op
def run_epoch(session, cost_op, train_op, saver, step):
    """Train until the input iterator is exhausted.

    Runs ``cost_op`` and ``train_op`` repeatedly, printing the per-token cost
    every 10 steps and saving a checkpoint to ``CHECKPOINT_PATH`` every 200
    steps. The loop ends when the session raises ``OutOfRangeError``.

    Args:
        session: Session used to run the ops and passed to the saver.
        cost_op: Op whose value is reported as the per-token cost.
        train_op: Op that performs one training step.
        saver: Saver used to write checkpoints.
        step: Global step count at the start of this epoch.

    Returns:
        The global step count after the epoch.
    """
    exhausted = False
    while not exhausted:
        try:
            batch_cost, _ = session.run([cost_op, train_op])
            if not step % 10:
                print("steps %d, per token cost is %.3f" % (step, batch_cost))
            if not step % 200:
                saver.save(session, CHECKPOINT_PATH, global_step=step)
            step = step + 1
        except tf.errors.OutOfRangeError:
            exhausted = True
    return step
def MakeDataset(file_path):
    """Build a dataset of ``(ids, length)`` pairs from a text file.

    Each line of the file is split on whitespace, the pieces are parsed as
    ``tf.int32`` numbers, and the resulting tensor is paired with its size.

    Args:
        file_path: Path of the text file of space-separated ids.

    Returns:
        A dataset whose elements are ``(ids, size_of_ids)`` tuples.
    """
    def _parse_line(line):
        # Split one line into string tokens, then convert them to int ids.
        pieces = tf.string_split([line]).values
        ids = tf.string_to_number(pieces, tf.int32)
        return ids, tf.size(ids)

    lines = tf.data.TextLineDataset(file_path)
    return lines.map(_parse_line)
def MakeSrcTrgDataset(src_path, trg_path, batch_size):
    """Build the shuffled, padded and batched training dataset.

    Source and target id files are zipped line by line. Pairs whose source or
    target length is not in ``(1, MAX_LEN]`` are dropped. The decoder input is
    derived from the target labels by prepending ``SOS_ID`` and dropping the
    last label. Elements are shuffled with a 10000-element buffer and
    padded-batched.

    Args:
        src_path: Path of the source id file.
        trg_path: Path of the target id file.
        batch_size: Number of sentence pairs per batch.

    Returns:
        A dataset whose elements are
        ``((src_input, src_len), (trg_input, trg_label, trg_len))``.
    """
    def _length_ok(length):
        # True when 1 < length <= MAX_LEN.
        return tf.logical_and(
            tf.greater(length, 1), tf.less_equal(length, MAX_LEN))

    def _keep_pair(src_tuple, trg_tuple):
        _, src_len = src_tuple
        _, trg_len = trg_tuple
        return tf.logical_and(_length_ok(src_len), _length_ok(trg_len))

    def _add_decoder_input(src_tuple, trg_tuple):
        src_input, src_len = src_tuple
        trg_label, trg_len = trg_tuple
        # Shift the labels right by one and put SOS_ID in front.
        trg_input = tf.concat([[SOS_ID], trg_label[:-1]], axis=0)
        return (src_input, src_len), (trg_input, trg_label, trg_len)

    pairs = tf.data.Dataset.zip((MakeDataset(src_path), MakeDataset(trg_path)))
    pairs = pairs.filter(_keep_pair)
    pairs = pairs.map(_add_decoder_input)
    pairs = pairs.shuffle(10000)

    # Sequences pad to the longest one in the batch; lengths are scalars.
    sequence_shape = tf.TensorShape([None])
    scalar_shape = tf.TensorShape([])
    padded_shapes = (
        (sequence_shape, scalar_shape),
        (sequence_shape, sequence_shape, scalar_shape),
    )
    return pairs.padded_batch(batch_size, padded_shapes)
def main():
    """Build the training graph and train for ``NUM_EPOCH`` epochs."""
    # Model variables created in __init__ use a uniform initializer.
    uniform_init = tf.random_uniform_initializer(-0.05, 0.05)
    with tf.variable_scope("nmt_model", reuse=None, initializer=uniform_init):
        train_model = NMTModel()

    dataset = MakeSrcTrgDataset(SRC_TRAIN_DATA, TRG_TRAIN_DATA, BATCH_SIZE)
    iterator = dataset.make_initializable_iterator()
    (src, src_size), (trg_input, trg_label, trg_size) = iterator.get_next()
    cost_op, train_op = train_model.forward(
        src, src_size, trg_input, trg_label, trg_size)

    saver = tf.train.Saver()
    step = 0
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=0.7, allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_options)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(1, NUM_EPOCH + 1):
            print("In iteration: %d" % epoch)
            # Rewind the input pipeline at the start of every epoch.
            sess.run(iterator.initializer)
            step = run_epoch(sess, cost_op, train_op, saver, step)


if __name__ == '__main__':
    main()
在对新句子进行inference时,解码的长度事先并不确定,需要用循环条件来决定何时停止解码(生成<eos>或达到最大解码长度),因此要用到tf.while_loop函数,这里直接给出预测代码:
#coding:utf-8
import tensorflow as tf
CHECKPOINT_PATH = "./seq2seq_ckpt-9000"  # checkpoint file restored below
HIDDEN_SIZE = 1024  # LSTM width; also used as the embedding size
NUM_LAYERS = 2  # number of stacked LSTM layers in encoder and decoder
SRC_VOCAB_SIZE = 10000
TRG_VOCAB_SIZE = 4000
BATCH_SIZE = 100
SHARE_EMB_AND_SOFTMAX = True  # reuse the target embedding as softmax weight
SOS_ID = 1  # id of "<sos>" in the generated vocabulary
EOS_ID = 2  # id of "<eos>" in the generated vocabulary


class NMTModel(object):
    """Encoder-decoder translation model used for greedy decoding."""

    def __init__(self):
        """Create the LSTM cells, embeddings and softmax parameters."""
        self.enc_cell = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)
             for _ in range(NUM_LAYERS)])
        self.dec_cell = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)
             for _ in range(NUM_LAYERS)])

        self.src_embedding = tf.get_variable(
            "src_emb", [SRC_VOCAB_SIZE, HIDDEN_SIZE])
        self.trg_embedding = tf.get_variable(
            "trg_emb", [TRG_VOCAB_SIZE, HIDDEN_SIZE])

        if SHARE_EMB_AND_SOFTMAX:
            # The output projection is the transposed target embedding.
            self.softmax_weight = tf.transpose(self.trg_embedding)
        else:
            self.softmax_weight = tf.get_variable(
                "weight", [HIDDEN_SIZE, TRG_VOCAB_SIZE])
        self.softmax_bias = tf.get_variable("softmax_bias", [TRG_VOCAB_SIZE])

    def inference(self, src_input):
        """Build the graph that greedily decodes one source sentence.

        Args:
            src_input: Python list of source token ids for one sentence.

        Returns:
            A 1-D int32 tensor of generated target ids, starting with
            ``SOS_ID``. Decoding stops after ``EOS_ID`` is produced or once
            the step counter reaches ``MAX_DEC_LEN - 1``.
        """
        # Wrap the single sentence as a batch of size one.
        src_size = tf.convert_to_tensor([len(src_input)], dtype=tf.int32)
        src_input = tf.convert_to_tensor([src_input], dtype=tf.int32)
        src_emb = tf.nn.embedding_lookup(self.src_embedding, src_input)

        with tf.variable_scope("encoder"):
            _, enc_state = tf.nn.dynamic_rnn(
                self.enc_cell, src_emb, src_size, dtype=tf.float32)

        MAX_DEC_LEN = 100
        # NOTE(review): this scope name presumably has to match the variable
        # names dynamic_rnn created for the decoder at training time so the
        # checkpoint restores - confirm against the training graph.
        with tf.variable_scope("decoder/rnn/multi_rnn_cell"):
            # Growable array of generated ids; slot 0 holds the start token.
            init_array = tf.TensorArray(
                dtype=tf.int32, size=0, dynamic_size=True,
                clear_after_read=False)
            init_array = init_array.write(0, SOS_ID)
            init_loop_var = (enc_state, init_array, 0)

            def continue_loop_condition(state, trg_ids, step):
                # Keep going while the latest id is not EOS and the length
                # limit has not been reached.
                return tf.reduce_all(tf.logical_and(
                    tf.not_equal(trg_ids.read(step), EOS_ID),
                    tf.less(step, MAX_DEC_LEN - 1)))

            def loop_body(state, trg_ids, step):
                # Feed the most recent id back in as the next decoder input.
                trg_input = [trg_ids.read(step)]
                trg_emb = tf.nn.embedding_lookup(self.trg_embedding, trg_input)
                dec_outputs, next_state = self.dec_cell.call(
                    state=state, inputs=trg_emb)
                output = tf.reshape(dec_outputs, [-1, HIDDEN_SIZE])
                logits = (tf.matmul(output, self.softmax_weight)
                          + self.softmax_bias)
                # Greedy choice: take the highest-scoring id.
                next_id = tf.argmax(logits, axis=1, output_type=tf.int32)
                trg_ids = trg_ids.write(step + 1, next_id[0])
                return next_state, trg_ids, step + 1

            state, trg_ids, step = tf.while_loop(
                continue_loop_condition, loop_body, init_loop_var)
            return trg_ids.stack()
def main():
    """Translate one hard-coded English sentence with a restored checkpoint.

    The sentence is tokenized, mapped to source-vocabulary ids (unknown
    tokens become ``<unk>``), decoded by the model, and the resulting ids are
    printed as target-vocabulary words.
    """
    from stanfordcorenlp import StanfordCoreNLP

    with tf.variable_scope("nmt_model", reuse=None):
        model = NMTModel()

    vocab_file = "../train.tags.en-zh.en.deletehtml.vocab"
    sentence = "It is very beautiful!"
    # An entry's id is its zero-based line number in the vocabulary file.
    # NOTE(review): assumes the vocabulary files are UTF-8.
    with open(vocab_file, 'r', encoding="utf-8") as f:
        word_to_id = {line.strip(): idx for idx, line in enumerate(f)}

    nlp = StanfordCoreNLP("../../stanford-corenlp-full-2018-10-05", lang='en')
    try:
        wordlist = nlp.word_tokenize(sentence.strip()) + ["<eos>"]
    finally:
        # Close the tokenizer even if tokenization fails.
        nlp.close()

    unk_id = word_to_id["<unk>"]
    idlist = [word_to_id.get(w, unk_id) for w in wordlist]

    output_op = model.inference(idlist)
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=0.7, allow_growth=True)
    saver = tf.train.Saver()
    # The context manager closes the session even if restore or run fails.
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
        saver.restore(session, CHECKPOINT_PATH)
        output = session.run(output_op)

    vocab_file2 = "../train.tags.en-zh.zh.deletehtml.vocab"
    with open(vocab_file2, 'r', encoding="utf-8") as f2:
        id_to_word = [line.strip() for line in f2]
    print([id_to_word[i] for i in output])


if __name__ == '__main__':
    main()
下面给出预测结果:
[\'<sos>\', \'这\', \'是\', \'非常\', \'美丽\', \'的\', \'!\', \'<eos>\']
由于不想让篇幅过长,所以直接上干货,具体原理请参考开头所说的书,有疑问请在下面留言,我的软件配置是python3.6.5 + tensorflow-gpu==1.12 + cuda9.0。
继续阅读与本文标签相同的文章
上一篇 :
前端练习25 实现一个EventEmitter
下一篇 :
王者荣耀AI即将上线,队友再也不用担心你掉线了
-
谷歌搜索广告出价方式
2026-05-18栏目: 教程
-
印度5G建设即将开始,是屈服于美国的施压,还是选择跟华为合作?
2026-05-18栏目: 教程
-
系列文章:云原生Kubernetes日志落地方案
2026-05-18栏目: 教程
-
QQ浏览器正孵化“用户增长团队”,解读中国浏览器行业发展趋势
2026-05-18栏目: 教程
-
Java并发系列(4)java关键字-synchronized
2026-05-18栏目: 教程
