本实践采用的是IWSLT TED演讲en-zh数据集,基于《TensorFlow:实战Google深度学习框架(第2版)》一书进行实验,大部分代码直接取自该书;为了更好地巩固知识,特整理成博客。

首先从相关网站上下载数据集,解压,代码如下所示:

wget https://wit3.fbk.eu/archive/2015-01//texts/en/zh/en-zh.tgz
tar xzvf en-zh.tgz
cd en-zh/

解压后目录中的文件如下所示。我们只关注train.tags.en-zh.en和train.tags.en-zh.zh两个文本数据文件,但它们都带有html标记,需要先进行预处理:

IWSLT15.TED.dev2010.en-zh.en.xml   IWSLT15.TED.tst2011.en-zh.en.xml   IWSLT15.TED.tst2013.en-zh.en.xml   train.tags.en-zh.zh
IWSLT15.TED.dev2010.en-zh.zh.xml   IWSLT15.TED.tst2011.en-zh.zh.xml   IWSLT15.TED.tst2013.en-zh.zh.xml   train.zh
IWSLT15.TED.tst2010.en-zh.en.xml   IWSLT15.TED.tst2012.en-zh.en.xml   README
IWSLT15.TED.tst2010.en-zh.zh.xml   IWSLT15.TED.tst2012.en-zh.zh.xml   train.tags.en-zh.en

在对原始数据进行处理时,需要进行分词、建库和数字化等操作,在将数据处理成可以输入的数据时还得进行padding。在这里,本人选取的中文和英文的分词工具都是stanfordcorenlp,相关知识请参考这篇博客。下面直接上代码:

#coding:utf-8
import collections
from operator import itemgetter
from stanfordcorenlp import StanfordCoreNLP
import tqdm

#第一步,把英文跟中文弄成一行一句的格式
def deletehtml(filename1,filename2):
	"""Drop markup lines from a pair of line-aligned parallel text files.

	Both files are read completely and walked in lockstep. A pair of lines is
	kept only when both lines are non-empty after stripping and neither of
	them contains ``<`` or ``>``. The surviving lines are written, one
	sentence per line, to ``<filename>.deletehtml`` for each input.

	Args:
		filename1: Path of the first text file.
		filename2: Path of the second text file, line-aligned with the first.

	Returns:
		A tuple ``(filename1 + ".deletehtml", filename2 + ".deletehtml")``.

	Raises:
		AssertionError: If the two files do not have the same number of lines.
	"""
	out1 = filename1+".deletehtml"
	out2 = filename2+".deletehtml"

	# Context managers close the inputs even when reading fails.
	with open(filename1,'r',encoding='utf-8') as f1, \
			open(filename2,'r',encoding='utf-8') as f2:
		data1 = f1.readlines()
		data2 = f2.readlines()

	# Original author's note: using codecs here raised an error for an unknown reason.
	assert len(data1)==len(data2), "parallel files differ in line count"

	print("deletehtml...")

	with open(out1,'w',encoding='utf-8') as fw1, \
			open(out2,'w',encoding='utf-8') as fw2:
		# zip() has no length, so pass the total for a meaningful progress bar.
		for line1,line2 in tqdm.tqdm(zip(data1,data2),total=len(data1)):
			line1 = line1.strip()
			line2 = line2.strip()
			if not (line1 and line2):
				continue
			# Any angle bracket on either side marks the pair as markup.
			if any(ch in text for text in (line1,line2) for ch in '<>'):
				continue
			fw1.write(line1+"\n")
			fw2.write(line2+"\n")

	return out1,out2

#第二步,分词并建立词库
def segement_sentence(filename,vocab_size,lang='en'):
	"""Tokenize a one-sentence-per-line file and build its vocabulary.

	Every line is tokenized with Stanford CoreNLP. The tokens are joined with
	single spaces and written to ``filename + ".segment"``. Token frequencies
	are counted, and the vocabulary is written to ``filename + ".vocab"``, one
	word per line: ``<unk>``, ``<sos>``, ``<eos>`` first, then the tokens in
	descending frequency, truncated to at most ``vocab_size`` entries.

	Args:
		filename: Path of the text file to tokenize.
		vocab_size: Maximum number of vocabulary entries, special tokens included.
		lang: Language code passed to ``StanfordCoreNLP``.

	Returns:
		The path of the tokenized file, ``filename + ".segment"``.
	"""
	segment_path = filename+".segment"
	counter = collections.Counter()

	# Read the input before starting CoreNLP, so a missing file does not
	# leave a server process behind.
	with open(filename,'r',encoding='utf-8') as f:
		data = f.readlines()

	nlp = StanfordCoreNLP("../stanford-corenlp-full-2018-10-05",lang=lang)
	try:
		print("segmenting...")
		with open(segment_path,'w',encoding='utf-8') as fw:
			for line in tqdm.tqdm(data):
				word_list = nlp.word_tokenize(line.strip())
				fw.write(' '.join(word_list)+"\n")
				counter.update(word_list)
	finally:
		# Always shut the CoreNLP backend down, even if tokenization fails.
		nlp.close()

	# most_common() orders by descending count and keeps first-seen order for ties.
	sorted_words = ["<unk>","<sos>","<eos>"] + [word for word,_ in counter.most_common()]
	sorted_words = sorted_words[:vocab_size]
	assert len(sorted_words)<=vocab_size

	with open(filename+".vocab",'w',encoding='utf-8') as fw:
		for word in sorted_words:
			fw.write(word+"\n")
	return segment_path

#第三步,将文本转换成数字编号
def convert_to_id(filename,vocab_file):
	"""Replace every token of a tokenized file with its vocabulary index.

	The vocabulary file holds one word per line; a word's id is its zero-based
	line number. Each input line is split on whitespace, ``<eos>`` is
	appended, and every word is mapped to its id. Words missing from the
	vocabulary get the id of ``<unk>``. The ids are written space-separated to
	``filename + ".id"``.

	Args:
		filename: Path of the whitespace-tokenized text file.
		vocab_file: Path of the vocabulary file.

	Returns:
		The path of the id file, ``filename + ".id"``.

	Raises:
		KeyError: If the vocabulary has no ``<unk>`` entry.
	"""
	with open(vocab_file,"r",encoding='utf-8') as f:
		vocab = [w.strip() for w in f]
	word_to_id = {word: idx for idx,word in enumerate(vocab)}
	# Resolve the fallback id once instead of once per unknown word.
	unk_id = word_to_id["<unk>"]

	with open(filename,"r",encoding='utf-8') as f:
		data = f.readlines()

	id_path = filename+".id"
	print("converting...")
	with open(id_path,'w',encoding='utf-8') as fw:
		for line in tqdm.tqdm(data):
			words = line.strip().split()+["<eos>"]
			ids = ' '.join(str(word_to_id.get(word,unk_id)) for word in words)
			fw.write(ids+"\n")
	return id_path

def main():
	"""Run the preprocessing pipeline on the raw training files.

	Steps: strip markup lines, tokenize and build a vocabulary for each
	language, then convert both tokenized files to id sequences.
	"""
	src = "train.tags.en-zh.en"  # still contains html markup
	trg = "train.tags.en-zh.zh"  # same as above
	src_vocab_size = 10000
	trg_vocab_size = 4000

	src1,trg1 = deletehtml(src,trg)

	src2 = segement_sentence(src1,src_vocab_size,lang='en')
	trg2 = segement_sentence(trg1,trg_vocab_size,lang='zh')

	# segement_sentence writes its vocabulary next to its input as "<input>.vocab",
	# so chain the returned paths instead of rebuilding the names by hand.
	convert_to_id(src2,src1+".vocab")
	convert_to_id(trg2,trg1+".vocab")


if __name__ == '__main__':
	main()

使用上述代码对数据文件进行处理后,我们就得到了每行一句话、每句话都已转成编号的文件,并且每个句子的末尾都加上了<eos>标志。①由于每句话的长短不一,需要将一个batch中较短的句子padding至最长句子的长度,这样encoder才能正常工作;②在decoder阶段,由于输入需要以<sos>开头,还得对目标语言再做一次处理,把x y z <eos>变为<sos> x y z。直接上完整的训练代码:

#coding:utf-8
import tensorflow as tf

MAX_LEN = 50  # sentence pairs longer than this on either side are filtered out
SOS_ID = 1  # id of <sos>: the second entry of the vocabulary files

SRC_TRAIN_DATA = "../train.tags.en-zh.en.deletehtml.segment.id"
TRG_TRAIN_DATA = "../train.tags.en-zh.zh.deletehtml.segment.id"
CHECKPOINT_PATH = "./seq2seq_ckpt"

HIDDEN_SIZE = 1024  # LSTM unit count, also used as the embedding dimension
NUM_LAYERS = 2  # number of stacked LSTM cells in encoder and decoder
SRC_VOCAB_SIZE = 10000
TRG_VOCAB_SIZE = 4000
BATCH_SIZE = 100
NUM_EPOCH = 5
KEEP_PROB = 0.8  # keep probability handed to tf.nn.dropout
MAX_GRAD_NORM = 5  # global-norm threshold for gradient clipping
SHARE_EMB_AND_SOFTMAX = True  # reuse the target embedding as the softmax weight

class NMTModel(object):
	"""Sequence-to-sequence translation model with stacked LSTM encoder and decoder."""

	def __init__(self):
		"""Create the RNN cells, both embedding tables and the softmax parameters."""
		self.enc_cell = tf.nn.rnn_cell.MultiRNNCell(
			[tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE) for _ in range(NUM_LAYERS)])
		self.dec_cell = tf.nn.rnn_cell.MultiRNNCell(
			[tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE) for _ in range(NUM_LAYERS)])

		self.src_embedding = tf.get_variable(
			"src_emb",[SRC_VOCAB_SIZE,HIDDEN_SIZE])
		self.trg_embedding = tf.get_variable(
			"trg_emb",[TRG_VOCAB_SIZE,HIDDEN_SIZE])

		if SHARE_EMB_AND_SOFTMAX:
			# Tie the output projection to the target embedding table.
			self.softmax_weight = tf.transpose(self.trg_embedding)
		else:
			self.softmax_weight = tf.get_variable("weight",[HIDDEN_SIZE,TRG_VOCAB_SIZE])
		self.softmax_bias = tf.get_variable("softmax_bias",[TRG_VOCAB_SIZE])

	def forward(self,src_input,src_size,trg_input,trg_label,trg_size):
		"""Build the training graph for one batch.

		Args:
			src_input: Padded source id batch (assumed [batch, time] int32, as
				produced by MakeSrcTrgDataset).
			src_size: Source sentence lengths, one per batch row.
			trg_input: Padded decoder inputs, i.e. target ids shifted right
				behind <sos>.
			trg_label: Padded target ids the decoder has to predict.
			trg_size: Target sentence lengths, one per batch row.

		Returns:
			A tuple ``(cost_per_token, train_op)``: the mean cross-entropy over
			the non-padding target positions and the op applying one clipped
			gradient-descent step.
		"""
		batch_size = tf.shape(src_input)[0]
		src_emb = tf.nn.embedding_lookup(self.src_embedding,src_input)
		trg_emb = tf.nn.embedding_lookup(self.trg_embedding,trg_input)

		src_emb = tf.nn.dropout(src_emb,KEEP_PROB)
		trg_emb = tf.nn.dropout(trg_emb,KEEP_PROB)

		with tf.variable_scope("encoder"):
			enc_outputs,enc_state = tf.nn.dynamic_rnn(
				self.enc_cell,src_emb,src_size,dtype=tf.float32)

		with tf.variable_scope("decoder"):
			# The decoder starts from the final encoder state.
			dec_outputs, _ = tf.nn.dynamic_rnn(
				self.dec_cell,trg_emb,trg_size,initial_state=enc_state)

		# Flatten batch and time so a single matmul produces all logits.
		output = tf.reshape(dec_outputs,[-1,HIDDEN_SIZE])
		logits = tf.matmul(output,self.softmax_weight) + self.softmax_bias
		loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(trg_label,[-1]),logits=logits)

		# Zero weight on padding positions keeps them out of the loss.
		label_weights = tf.sequence_mask(trg_size,maxlen=tf.shape(trg_label)[1],dtype=tf.float32)
		label_weights = tf.reshape(label_weights,[-1])

		cost = tf.reduce_sum(loss*label_weights)
		cost_per_token = cost / tf.reduce_sum(label_weights)

		trainable_variables = tf.trainable_variables()

		# Gradients are taken on the per-sentence average, not the per-token one.
		grads = tf.gradients(cost / tf.to_float(batch_size), trainable_variables)
		grads,_ = tf.clip_by_global_norm(grads,MAX_GRAD_NORM)
		optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
		train_op = optimizer.apply_gradients(zip(grads,trainable_variables))

		return cost_per_token,train_op

def run_epoch(session,cost_op,train_op,saver,step):
	"""Run training steps until the input iterator is exhausted.

	Prints the per-token cost every 10 steps and saves a checkpoint to
	CHECKPOINT_PATH every 200 steps.

	Args:
		session: Session used to run the ops and to save checkpoints.
		cost_op: Op whose value is the per-token cost.
		train_op: Op performing one optimization step.
		saver: Saver used for the periodic checkpoints.
		step: Global step counter at the start of the epoch.

	Returns:
		The global step counter after the epoch.
	"""
	while True:
		# Only the session call signals end of data, so keep the try block minimal.
		try:
			cost,_ = session.run([cost_op,train_op])
		except tf.errors.OutOfRangeError:
			break
		if step%10 == 0:
			print("steps %d, per token cost is %.3f"%(step,cost))
		if step%200 == 0:
			saver.save(session,CHECKPOINT_PATH,global_step=step)
		step += 1
	return step


def MakeDataset(file_path):
	"""Build a dataset of ``(ids, length)`` pairs from a file of id sequences.

	Each text line is split on whitespace, the pieces are parsed as int32
	numbers, and the resulting vector is paired with its size.
	"""
	def split_tokens(line):
		return tf.string_split([line]).values

	def parse_ids(tokens):
		return tf.string_to_number(tokens,tf.int32)

	def attach_length(ids):
		return (ids,tf.size(ids))

	lines = tf.data.TextLineDataset(file_path)
	return lines.map(split_tokens).map(parse_ids).map(attach_length)

def MakeSrcTrgDataset(src_path,trg_path,batch_size):
	"""Build the padded, batched training dataset from two id files.

	Source and target datasets are zipped. Pairs in which either side has a
	length outside ``(1, MAX_LEN]`` are dropped. The decoder input is formed by
	putting SOS_ID in front of the target labels and dropping the last label.
	The result is shuffled with a buffer of 10000 and padded into batches.

	Each element of the returned dataset has the structure
	``((src_input, src_len), (trg_input, trg_label, trg_len))``.
	"""
	def length_ok(length):
		return tf.logical_and(tf.greater(length,1),tf.less_equal(length,MAX_LEN))

	def keep_pair(src_tuple,trg_tuple):
		(_,src_len),(_,trg_len) = src_tuple,trg_tuple
		return tf.logical_and(length_ok(src_len),length_ok(trg_len))

	def add_decoder_input(src_tuple,trg_tuple):
		(src_input,src_len),(trg_label,trg_len) = src_tuple,trg_tuple
		# "x y z <eos>" becomes "<sos> x y z" on the decoder input side.
		trg_input = tf.concat([[SOS_ID],trg_label[:-1]],axis=0)
		return ((src_input,src_len),(trg_input,trg_label,trg_len))

	pairs = tf.data.Dataset.zip((MakeDataset(src_path),MakeDataset(trg_path)))
	pairs = pairs.filter(keep_pair)
	pairs = pairs.map(add_decoder_input)
	pairs = pairs.shuffle(10000)

	# Id vectors are padded to the longest one in the batch; lengths are scalars.
	vector = tf.TensorShape([None])
	scalar = tf.TensorShape([])
	padded_shapes = ((vector,scalar),(vector,vector,scalar))
	return pairs.padded_batch(batch_size,padded_shapes)
				

def main():
	"""Build the model and input pipeline, then train for NUM_EPOCH epochs."""
	initializer = tf.random_uniform_initializer(-0.05,0.05)
	with tf.variable_scope("nmt_model",reuse=None,initializer=initializer):
		train_model = NMTModel()

	data = MakeSrcTrgDataset(SRC_TRAIN_DATA,TRG_TRAIN_DATA,BATCH_SIZE)
	iterator = data.make_initializable_iterator()
	(src,src_size),(trg_input,trg_label,trg_size) = iterator.get_next()

	cost_op,train_op = train_model.forward(src,src_size,trg_input,trg_label,trg_size)
	saver = tf.train.Saver()
	step = 0

	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7,allow_growth=True)
	config = tf.ConfigProto(gpu_options=gpu_options)

	with tf.Session(config=config) as sess:
		tf.global_variables_initializer().run()
		for i in range(NUM_EPOCH):
			print("In iteration: %d"%(i+1))
			# Rewind the dataset at the start of every epoch.
			sess.run(iterator.initializer)
			step = run_epoch(sess,cost_op,train_op,saver,step)


if __name__ == '__main__':
	main()

在对新句子进行inference时,需要逐步解码,并确定解码的终止条件(生成<eos>或达到最大解码长度),因此要用到tf.while_loop函数,这里直接给出预测代码:

#coding:utf-8
import tensorflow as tf


CHECKPOINT_PATH = "./seq2seq_ckpt-9000"

HIDDEN_SIZE = 1024  # LSTM unit count, also used as the embedding dimension
NUM_LAYERS = 2  # number of stacked LSTM cells in encoder and decoder
SRC_VOCAB_SIZE = 10000
TRG_VOCAB_SIZE = 4000
BATCH_SIZE = 100
SHARE_EMB_AND_SOFTMAX = True  # reuse the target embedding as the softmax weight
SOS_ID = 1  # id of <sos>
EOS_ID = 2  # id of <eos>

class NMTModel(object):
	"""Sequence-to-sequence translation model, inference-time variant."""

	def __init__(self):
		"""Create the RNN cells, both embedding tables and the softmax parameters."""
		self.enc_cell = tf.nn.rnn_cell.MultiRNNCell(
			[tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE) for _ in range(NUM_LAYERS)])
		self.dec_cell = tf.nn.rnn_cell.MultiRNNCell(
			[tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE) for _ in range(NUM_LAYERS)])

		self.src_embedding = tf.get_variable(
			"src_emb",[SRC_VOCAB_SIZE,HIDDEN_SIZE])
		self.trg_embedding = tf.get_variable(
			"trg_emb",[TRG_VOCAB_SIZE,HIDDEN_SIZE])

		if SHARE_EMB_AND_SOFTMAX:
			# Tie the output projection to the target embedding table.
			self.softmax_weight = tf.transpose(self.trg_embedding)
		else:
			self.softmax_weight = tf.get_variable("weight",[HIDDEN_SIZE,TRG_VOCAB_SIZE])
		self.softmax_bias = tf.get_variable("softmax_bias",[TRG_VOCAB_SIZE])

	def inference(self,src_input):
		"""Build the greedy-decoding graph for a single source sentence.

		Args:
			src_input: Python list of source word ids (one sentence).

		Returns:
			A 1-D int32 tensor with the decoded target ids. It starts with
			SOS_ID and stops after EOS_ID was produced or MAX_DEC_LEN ids
			were written.
		"""
		# Wrap the single sentence as a batch of size one.
		src_size = tf.convert_to_tensor([len(src_input)],dtype=tf.int32)
		src_input = tf.convert_to_tensor([src_input],dtype=tf.int32)
		src_emb = tf.nn.embedding_lookup(self.src_embedding,src_input)

		with tf.variable_scope("encoder"):
			enc_outputs,enc_state = tf.nn.dynamic_rnn(
				self.enc_cell,src_emb,src_size,dtype=tf.float32)
		MAX_DEC_LEN = 100

		# NOTE(review): this scope presumably mirrors the variable names that
		# dynamic_rnn created for the decoder during training, so the
		# checkpoint restores cleanly. Confirm against the checkpoint.
		with tf.variable_scope("decoder/rnn/multi_rnn_cell"):
			# Dynamically sized array of generated ids, seeded with <sos>.
			init_array = tf.TensorArray(dtype=tf.int32,size=0,dynamic_size=True,clear_after_read=False)
			init_array = init_array.write(0,SOS_ID)

			init_loop_var = (enc_state,init_array,0)

			def continue_loop_condition(state,trg_ids,step):
				# Keep going while the last id is not <eos> and there is room left.
				return tf.reduce_all(tf.logical_and(tf.not_equal(trg_ids.read(step),EOS_ID),tf.less(step,MAX_DEC_LEN-1)))

			def loop_body(state,trg_ids,step):
				# Feed the most recently generated id back into the decoder.
				trg_input = [trg_ids.read(step)]
				trg_emb = tf.nn.embedding_lookup(self.trg_embedding,trg_input)

				# Run a single decoder step directly on the cell.
				dec_outputs,next_state = self.dec_cell.call(state=state,inputs=trg_emb)
				output = tf.reshape(dec_outputs,[-1,HIDDEN_SIZE])
				logits = (tf.matmul(output,self.softmax_weight) + self.softmax_bias)
				# Greedy choice: take the highest-scoring word.
				next_id = tf.argmax(logits,axis=1,output_type=tf.int32)

				trg_ids = trg_ids.write(step+1,next_id[0])
				return next_state,trg_ids,step+1

			state,trg_ids,step = tf.while_loop(
				continue_loop_condition,loop_body,init_loop_var)
			return trg_ids.stack()

def main():
	"""Translate one hard-coded English sentence with the saved checkpoint.

	The sentence is tokenized with Stanford CoreNLP and mapped to source
	vocabulary ids, with <eos> appended. The checkpoint at CHECKPOINT_PATH is
	restored, the sentence is decoded, and the result is printed as a list of
	target-vocabulary words.
	"""
	from stanfordcorenlp import StanfordCoreNLP

	sentence = "It is very beautiful!"
	vocab_file = "../train.tags.en-zh.en.deletehtml.vocab"
	vocab_file2 = "../train.tags.en-zh.zh.deletehtml.vocab"

	with tf.variable_scope("nmt_model",reuse=None):
		model = NMTModel()

	with open(vocab_file,'r',encoding='utf-8') as f:
		src_words = [w.strip() for w in f]
	word_to_id = {word: idx for idx,word in enumerate(src_words)}

	# CoreNLP is only needed for tokenization, so shut it down right after.
	nlp = StanfordCoreNLP("../../stanford-corenlp-full-2018-10-05",lang='en')
	try:
		wordlist = nlp.word_tokenize(sentence.strip()) + ["<eos>"]
	finally:
		nlp.close()

	# Out-of-vocabulary words fall back to the id of <unk>.
	unk_id = word_to_id["<unk>"]
	idlist = [word_to_id.get(w,unk_id) for w in wordlist]

	output_op = model.inference(idlist)
	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7,allow_growth=True)
	saver = tf.train.Saver()
	# The context manager closes the session even if restore or run fails.
	with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
		saver.restore(session,CHECKPOINT_PATH)
		output = session.run(output_op)

	with open(vocab_file2,'r',encoding='utf-8') as f2:
		trg_words = [w.strip() for w in f2]
	id_to_word = dict(enumerate(trg_words))
	print([id_to_word[i] for i in output])


if __name__ == '__main__':
	main()

下面给出预测结果:

[\'<sos>\', \'这\', \'是\', \'非常\', \'美丽\', \'的\', \'!\', \'<eos>\']

由于不想让篇幅过长,所以直接上干货,具体原理请参考开头所说的书,有疑问请在下面留言,我的软件配置是python3.6.5 + tensorflow-gpu==1.12 + cuda9.0。

收藏 打印