Advanced TensorFlow Usage and Experiments: from input_fn to the modelling part


Background

While interning at my current company, Kuaishou, I was part of a project where I needed to set up an LSTM model, and I had to transfer my skill set from PyTorch to TensorFlow in a short period of time, which was a really painful experience for me. So I want to share this blog with you to ease your pain.

Some complaints

I have to admit TensorFlow is a very powerful computing framework: it has a much stronger and larger community than PyTorch, and it gives you access to fancy, high-level features like tf.serving, TensorBoard, distributed computation, etc. Still, it has a very steep learning curve and is extra unfriendly to new users because of how rapidly its versions shift. Maybe you first struggle through the tf.placeholder style of data preprocessing and modelling, and then suddenly find out that this is not how we use it anymore; how would you feel then? Well, if this is the first blog you read on the topic, you will know what I mean in a few hours...

What's exactly in this blog

In this blog I present most of the experiments I did while writing LSTM model code with TensorFlow 1.9 and Python 3.6. I split my toy experiments into four parts:

  • Data input process
  • LSTM cell tests
  • Shared embedding tests
  • Miscellaneous functions

Who should read this blog

You should already have had some taste of TensorFlow and want more detailed, useful knowledge without writing these toy examples yourself. I'm confident these functions and tests will be helpful if you are dealing with a high volume of data and want to keep your code neatly organized.

Data input process

TFRecord saving and reading

import tensorflow as tf
import numpy as np

def writedata():
    xlist = [[1, 2, 3], [4, 5, 6, 8], [23, 1]]
    xstar_list = [[15, 25], [23.1], [1, 2, 3]]
    ylist = [1, 2, 3]
    #name_list = [['a'.encode('utf8'), 'b'.encode('utf8')], ['a'.encode('utf8'), 'c'.encode('utf8'), 'd'.encode('utf8')]]
    name_list = [['a'.encode('utf8'), 'b'.encode('utf8')], ['a'.encode('utf8')], ['b'.encode('utf8')]]

    writer = tf.python_io.TFRecordWriter("train.tfrecords")
    for i in range(3):
        x = xlist[i]
        y = ylist[i]
        x_star = xstar_list[i]
        name = name_list[i]
        example = tf.train.Example(features=tf.train.Features(feature={
            "y": tf.train.Feature(int64_list=tf.train.Int64List(value=[y])),
            'x': tf.train.Feature(int64_list=tf.train.Int64List(value=x)),
            'x_star': tf.train.Feature(float_list=tf.train.FloatList(value=x_star)),
            'name_star': tf.train.Feature(bytes_list=tf.train.BytesList(value=name))
        }))
        writer.write(example.SerializeToString())
    writer.close()

writedata()

Observations from this part

  1. A sparse tensor can be fed into an embedding layer directly; the default value is ignored and is not counted in the denominator when the combiner is mean.
  2. sparse_tensor_to_dense must be given a default_value that matches the tensor's own data type: int and float tensors can use 0, while tf.string needs a str, e.g. '0'.
  3. If you batch with tf.data (dataset.batch) and your examples have variable lengths, you should use parse_example instead of parse_single_example.
  4. tf.pad can also pad the outer dimension, i.e. add a layer of zeros around your data; see the sketch right after this list.
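As a minimal sketch of point 4 (the toy tensor below is my own example, not from the original experiments), padding the outer dimension with tf.pad looks like this:

import tensorflow as tf

a = tf.constant([[1, 2], [3, 4]])
# paddings lists [before, after] per dimension; [1, 1] on axis 0 adds one
# row of zeros before and one after, so the outer dimension grows by two
padded = tf.pad(a, paddings=[[1, 1], [0, 0]])

with tf.Session() as sess:
    print(sess.run(padded))  # [[0 0] [1 2] [3 4] [0 0]]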
import tensorflow as tf
import numpy as np


def my_input_fn(file_path, perform_shuffle=False, repeat_count=10):
    def parse(example_proto):
        features = {"x": tf.VarLenFeature(tf.int64),
                    "x_star": tf.VarLenFeature(tf.float32),
                    'name_star': tf.VarLenFeature(tf.string),
                    "y": tf.FixedLenFeature([1], tf.int64)}
        parsed_features = tf.parse_example(example_proto, features)
        x = tf.sparse_tensor_to_dense(parsed_features["x"])
        x = tf.cast(x, tf.int32)

        parsed_features['x'] = x
        parsed_features['x_star'] = tf.sparse_tensor_to_dense(parsed_features["x_star"])
        parsed_features['name_star'] = tf.sparse_tensor_to_dense(parsed_features["name_star"], default_value='0')

        y = tf.cast(parsed_features["y"], tf.int32)
        return parsed_features, y

    dataset = tf.data.TFRecordDataset(file_path).batch(3)
    dataset = dataset.map(parse)

    #dataset = (tf.data.TFRecordDataset(file_path).batch(2).map(parse))
    if perform_shuffle:
        dataset = dataset.shuffle(buffer_size=256)
    dataset = dataset.repeat(repeat_count)
    #dataset = dataset.padded_batch(2, padded_shapes=({'x': [6]}, [1]))  # batch size 2, pad x to maxlen=6
    #dataset = dataset.padded_batch(2, padded_shapes=dataset.output_shapes)  # batch size 2, pad to the dataset's own output shapes

    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels


next_batch = my_input_fn('train.tfrecords', True)


## test embedding
feat_columns = []  # feat_columns must be an iterable data type
feat_columns.append(tf.feature_column.numeric_column('x', 4))        # 0
feat_columns.append(tf.feature_column.numeric_column('x_star', 2))   # 1


embed_info = tf.feature_column.categorical_column_with_vocabulary_list(
    'name_star',
    ['a', 'b', 'c', 'd', 'e'], default_value=-1,
    dtype=tf.string, num_oov_buckets=2)

name_tensor = tf.feature_column.embedding_column(embed_info, 5, combiner='mean')

#feat_columns.append(tf.feature_column.shared_embedding_columns([embed_info], dimension=32)[0])
#feat_columns.append(tf.feature_column.embedding_column(embed_info, dimension=32))  # 2
op = tf.feature_column.input_layer(next_batch[0], feature_columns=name_tensor)
with tf.Session() as sess:
    #sess.run(init)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    print(sess.run([op]))

    sess.run(tf.global_variables_initializer())
    xs, y = sess.run(next_batch)
    #print(sess.run([tf.feature_column.input_layer(xs, feature_columns=[name_tensor])]))
    print(np.size(xs['x']))  # 3*4 = 12, since the batch size is 3
    print('x: ', xs['x'])
    print('x_star: ', xs['x_star'])
    print('y:', y)

    print('testing pad for lstm')
    #print(sess.run(tf.pad(xs['x'], tf.constant([[0, 0], [0, 0]], dtype=tf.int32))))

    print('testing feature columns for lstm')
    #print(sess.run(tf.feature_column.input_layer(xs, feature_columns=feat_columns[0])))

    print('testing string feature columns')
    print(xs['name_star'])
    #print(feat_columns[2])
    #print(sess.run([op]))

    print(np.squeeze(xs['x'][1]))
    print(xs['x'][range(2), [1, 2]])
    print(type(y))
    print(np.size(y))

[array([[ 0.29952478, -0.02905731, -0.14833574, -0.25489837,  0.13668409],
        [ 0.36393905, -0.18847883,  0.01317748,  0.02921137, -0.3228819 ],
        [ 0.20875314, -0.471264  , -0.23475473, -0.10564104, -0.1293019 ]], dtype=float32)]
12
x:  [[ 1  2  3  0]
 [ 4  5  6  8]
 [23  1  0  0]]
x_star:  [[15.  25.   0. ]
 [23.1  0.   0. ]
 [ 1.   2.   3. ]]
y: [[1]
 [2]
 [3]]
testing pad for lstm
testing feature columns for lstm
testing string feature columns
[[b'a' b'b']
 [b'a' b'0']
 [b'b' b'0']]
[4 5 6 8]
[2 6]
<class 'numpy.ndarray'>
3

TensorFlow LSTM cell

Single cell case

Test the output of an LSTM cell

import tensorflow as tf
import numpy as np
from tensorflow.contrib import rnn
tf.reset_default_graph()

# Create input data
X = np.random.randn(2, 10, 8)

# The second example is of length 6: zero out everything after step 6
X[1, 6:] = 0
X_lengths = [10, 6]

cell = tf.nn.rnn_cell.LSTMCell(num_units=64, state_is_tuple=True)

init_state = np.random.randn(2, 64)  # batch_size * hidden_state_size
init_s = tf.contrib.rnn.LSTMStateTuple(c=tf.convert_to_tensor(init_state), h=tf.convert_to_tensor(init_state))


outputs, states = tf.nn.dynamic_rnn(cell=cell,
                                    dtype=tf.float64,
                                    initial_state=init_s,
                                    sequence_length=X_lengths,
                                    inputs=X)


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    output_val, state_val = sess.run([outputs, states])
    print(output_val.shape)
    print(state_val.c.shape)




assert outputs.shape == (2, 10, 64)
# for each sequence, the output at the last valid step and the returned
# final state must be equal
print(outputs)  # all ten outputs of the two samples
print(states)   # cell state and hidden state: the final state for each of the two sequences

(2, 10, 64)
(2, 64)
Tensor("rnn/transpose_1:0", shape=(2, 10, 64), dtype=float64)
LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(2, 64) dtype=float64>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(2, 64) dtype=float64>)
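To check the claim in the comments above, here is a small verification sketch of my own (not in the original post): compare each sequence's last valid output with the returned hidden state.

# uses output_val, state_val and X_lengths from the block above
for i, length in enumerate(X_lengths):
    last_valid_output = output_val[i, length - 1, :]        # output at the last valid time step
    print(np.allclose(last_valid_output, state_val.h[i]))   # expected: True for both sequences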

Multicell case

tf.reset_default_graph()

#cell = tf.nn.rnn_cell.LSTMCell(num_units=64, state_is_tuple=True)
multi_cell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(num_units=64, state_is_tuple=True),
                                          tf.nn.rnn_cell.LSTMCell(num_units=64, state_is_tuple=True)])

init_state = np.random.randn(2, 64)
init_s = tf.contrib.rnn.LSTMStateTuple(c=tf.convert_to_tensor(init_state), h=tf.convert_to_tensor(init_state))


outputs, states = tf.nn.dynamic_rnn(cell=multi_cell,
                                    dtype=tf.float64,
                                    initial_state=(init_s, init_s),
                                    sequence_length=X_lengths,
                                    inputs=X)


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    a = sess.run(tf.sigmoid(X))

    output_val, state_val = sess.run([outputs, states])
    print(type(output_val))
    print(output_val[:, 1:, :].shape)
    print('Something related to StateTuple')

<class 'numpy.ndarray'>
(2, 9, 64)
Something related to StateTuple
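For the multi-cell case, states is a tuple holding one LSTMStateTuple per stacked layer; here is a short inspection sketch of my own (the original run only prints a placeholder string):

# state_val comes from sess.run([outputs, states]) in the block above
print(len(state_val))        # 2: one LSTMStateTuple per stacked LSTM layer
print(state_val[0].c.shape)  # (2, 64): cell state of the first layer
print(state_val[1].h.shape)  # (2, 64): hidden state of the second (top) layer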

Fast predict

This was a tricky part of my project; I will explain shortly why we need it and what it is.

TensorFlow fast predict

Use a generator to keep .predict open.

Why do you need this? Detailed explanation here

import tensorflow as tf

class FastPredict:

    def __init__(self, estimator, input_fn):
        self.estimator = estimator
        self.first_run = True
        self.closed = False
        self.input_fn = input_fn

    def _create_generator(self):
        while not self.closed:
            yield self.next_features

    def predict(self, feature_batch):
        """ Runs a prediction on a set of features. Calling it multiple times
        does *not* regenerate the graph, which makes predict much faster.
        feature_batch is a list of lists of features. IMPORTANT: if you're only classifying one thing,
        you still need to make it a batch of 1 by wrapping it in a list (i.e. predict([my_feature]), not predict(my_feature)).
        """
        self.next_features = feature_batch
        if self.first_run:
            self.batch_size = len(feature_batch)
            self.predictions = self.estimator.predict(
                input_fn=self.input_fn(self._create_generator))
            self.first_run = False
        elif self.batch_size != len(feature_batch):
            raise ValueError("All batches must be of the same size. First-batch:" + str(self.batch_size) + " This-batch:" + str(len(feature_batch)))

        results = []
        for _ in range(self.batch_size):
            results.append(next(self.predictions))
        return results

    def close(self):
        self.closed = True
        try:
            next(self.predictions)
        except:
            print("Exception in fast_predict. This is probably OK")


def example_input_fn(generator):
    """ An example input function to pass to predict. It must take a generator as input """

    def _inner_input_fn():
        # output_types are dtypes; output_shapes describe the per-example shapes
        dataset = tf.data.Dataset().from_generator(generator,
                                                   output_types={'a': tf.float32, 'b': tf.float32},
                                                   output_shapes={'a': (10,), 'b': (2,)}).batch(1)
        iterator = dataset.make_one_shot_iterator()
        features = iterator.get_next()
        return features

    return _inner_input_fn

tf.data.Dataset().from_generator

import numpy as np

def _create_generator():
    for i in range(3):
        a = np.random.randn(3, 2)
        b = np.random.randn(2)
        result = {}
        result['a'] = a
        result['b'] = b
        yield result


gen = _create_generator()

dataset = tf.data.Dataset().from_generator(_create_generator,
                                           output_shapes={'a': (3, 2), 'b': 2},
                                           output_types={'a': tf.float32, 'b': tf.float32}).batch(1)
iterator = dataset.make_one_shot_iterator()
features = iterator.get_next()


init = tf.initialize_all_variables()
with tf.Session() as sess:
    sess.run(init)
    print(sess.run(features))

WARNING:tensorflow:From /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/util/tf_should_use.py:118: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating: Use tf.global_variables_initializer instead.
{'a': array([[[ 0.6078665 ,  0.7673362 ],
        [-1.095272  , -0.44154257],
        [ 0.24826635,  1.8101764 ]]], dtype=float32), 'b': array([[-1.3138337 ,  0.05587422]], dtype=float32)}

import numpy as np

def _create_generator():
    for i in range(3):
        a = np.random.randn(3, 2)
        b = np.random.randn(2)
        result = {}
        result['a'] = tf.contrib.rnn.LSTMStateTuple(c=a, h=b)
        yield result


gen = _create_generator()

dataset = tf.data.Dataset().from_generator(_create_generator,
                                           output_types={'a': tf.contrib.rnn.LSTMStateTuple(c=tf.float32, h=tf.float32)}).batch(1)

iterator = dataset.make_one_shot_iterator()
features = iterator.get_next()


init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    g = sess.run(features)
    print(g['a'])

LSTMStateTuple(c=array([[[ 0.15593866, -1.1535455 ],
        [ 0.8337963 ,  0.3000586 ],
        [ 1.3395942 , -0.65611506]]], dtype=float32), h=array([[ 0.5950521 , -0.82992613]], dtype=float32))

import os
print(os.environ.get('CONFIG'))

None
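For completeness, a tiny follow-up of my own: the lookup returns the value once the environment variable is actually set.

import os

os.environ['CONFIG'] = 'prod'    # set it in-process (or export CONFIG=prod in the shell)
print(os.environ.get('CONFIG'))  # prod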

import tensorflow as tf
hparams = tf.contrib.training.HParams(
    learning_rate=0.1, num_hidden_units=100, activation='relu')
print(hparams.get('learning_rate2', None))
hparams.add_hparam('test2', 2)
#None > 2  # can't compare None with 2

None
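A small follow-up of my own on the HParams object above: the added hyperparameter becomes a normal attribute, and existing values can be overridden from a comma-separated string, which is handy for command-line flags.

print(hparams.test2)                # 2, added by add_hparam above
print(hparams.learning_rate)        # 0.1
hparams.parse('learning_rate=0.5')  # override an existing hparam from a string
print(hparams.learning_rate)        # 0.5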

import configparser as cp

target = cp.ConfigParser()
config = target.read('single.ini')

# use config to get various params
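As a minimal sketch of reading the parameters back (the section and key names in single.ini are my own assumptions, not from the original post):

# assuming single.ini contains something like:
# [model]
# learning_rate = 0.1
# num_hidden_units = 100
learning_rate = target.getfloat('model', 'learning_rate')
num_hidden_units = target.getint('model', 'num_hidden_units')
print(learning_rate, num_hidden_units)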

Test shared embedding

How to use embedding

tf.feature_column.input_layer(.., ..)

How tf.VarLenFeature behaves when read in a batch

The data comes back as a tf.SparseTensor.

Embedding: make_parse_example_spec

It is a VarLenFeature because an embedding input can hold multiple values per example, which are then combined by sum, mean, etc.

fc = tf.feature_column.categorical_column_with_hash_bucket('my_fc',
                                                           10)

shared_columns = tf.feature_column.shared_embedding_columns([fc],
                                                            32, combiner='sum',
                                                            shared_embedding_collection_name='my_em_fc')
tf.feature_column.make_parse_example_spec(shared_columns)

{'my_fc': VarLenFeature(dtype=tf.string)}

fc = tf.feature_column.categorical_column_with_vocabulary_file('my_fc', vocabulary_file='abc', vocabulary_size=100,
                                                               num_oov_buckets=10)

shared_columns = tf.feature_column.shared_embedding_columns([fc],
                                                            32, combiner='sum',
                                                            shared_embedding_collection_name='my_em_fc')
tf.feature_column.make_parse_example_spec(shared_columns)

{'my_fc': VarLenFeature(dtype=tf.string)}

Check tf.VarLenFeature

fc = tf.feature_column.categorical_column_with_vocabulary_file('my_fc', vocabulary_file='abc', vocabulary_size=100,
                                                               num_oov_buckets=10)

shared_columns = tf.feature_column.shared_embedding_columns([fc],
                                                            32, combiner='sum',
                                                            shared_embedding_collection_name='my_em_fc')
target = tf.feature_column.make_parse_example_spec(shared_columns)
for key, item in target.items():
    print(type(item) == tf.VarLenFeature)
    print(item)
    print(key)
print(type(shared_columns[0]))
print(shared_columns[0])
print('_SharedEmbeddingColumn' in str(type(shared_columns[0])))
a = shared_columns[0]
print(shared_columns[0].name)

True
VarLenFeature(dtype=tf.string)
my_fc
<class 'tensorflow.python.feature_column.feature_column._SharedEmbeddingColumn'>
_SharedEmbeddingColumn(categorical_column=_VocabularyFileCategoricalColumn(key='my_fc', vocabulary_file='abc', vocabulary_size=100, num_oov_buckets=10, dtype=tf.string, default_value=-1), dimension=32, combiner='sum', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x11e6f6668>, shared_embedding_collection_name='my_em_fc', ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True)
True
my_fc_shared_embedding

Check the naming of these methods

fc = tf.feature_column.categorical_column_with_vocabulary_list('my_fc', vocabulary_list=['a', 'b'],
                                                               num_oov_buckets=10)

shared_columns = tf.feature_column.shared_embedding_columns([fc],
                                                            32, combiner='mean',
                                                            shared_embedding_collection_name='my_em_fc')

indicated_columns = tf.feature_column.indicator_column(fc)



boundaries = [1.0, 2.0]
source_column = tf.feature_column.numeric_column('day', 2)
bucketized_column = tf.feature_column.bucketized_column(source_column, boundaries=boundaries)


target = tf.feature_column.make_parse_example_spec(shared_columns)
print(indicated_columns.name)
a = shared_columns[0]
print(a.name[:-17])
print(a)
a.dimension
a.combiner == 'mean'
print(target)
print('target keys')
print(list(target.keys())[0])  # change the keys view to a list to support indexing

print(source_column.name)
print(bucketized_column.name)

print(tf.feature_column.make_parse_example_spec([indicated_columns]))

print(tf.feature_column.make_parse_example_spec([source_column]))
print(tf.feature_column.make_parse_example_spec([bucketized_column]))
target['my_fc'].dtype == tf.string

my_fc_indicator
my_fc
_SharedEmbeddingColumn(categorical_column=_VocabularyListCategoricalColumn(key='my_fc', vocabulary_list=('a', 'b'), dtype=tf.string, default_value=-1, num_oov_buckets=10), dimension=32, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x11e6f3668>, shared_embedding_collection_name='my_em_fc', ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True)
{'my_fc': VarLenFeature(dtype=tf.string)}
target keys
my_fc
day
day_bucketized
{'my_fc': VarLenFeature(dtype=tf.string)}
{'day': FixedLenFeature(shape=(2,), dtype=tf.float32, default_value=None)}
{'day': FixedLenFeature(shape=(2,), dtype=tf.float32, default_value=None)}
True

Embedding - input_layer

The embedding layer should be defined before you run the TensorFlow initializers, because in most cases it is also a trainable layer whose variables need to be initialized.

target = {}
target['name_star'] = [[b'a', b'b', b''], [b'a', b'c', b'd']]

embed_info = tf.feature_column.categorical_column_with_vocabulary_list(
    'name_star',
    ['a', 'b', 'c', 'd', 'e'], default_value=-1,
    dtype=tf.string, num_oov_buckets=2)

name_tensor = tf.feature_column.embedding_column(embed_info, 5)

op = tf.feature_column.input_layer(target, feature_columns=name_tensor)


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    for i in range(1):
        print(sess.run([op]))

Miscellany

tf.gather_nd

a = np.array([[1, 2, 3], [4, 5, 6]])
b = tf.convert_to_tensor(a)

index = [[0, 0], [1, b[0, 0]]]


init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    print(sess.run(tf.gather_nd(b, index)))
    print(type(b))
    print(b[:, 1])

tf.less + tf.where

import tensorflow as tf
import numpy as np

a = np.array([1, 2, 3, 4, 2, 4, 2])
b = tf.convert_to_tensor(a)

result = tf.where(tf.less(b, 3), tf.ones_like(b), tf.zeros_like(b))

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    print(type(b))
    print(sess.run(result))

Tensor computation

a = tf.constant([[2, 3, 5], [4, 5, 7]])
b = tf.constant([2, 3])
with tf.Session() as sess:
    print(sess.run(a))
    print(sess.run(b))
    print(sess.run(a / tf.expand_dims(b, 1)))

Tensor reshape

Confirm the reshape logic: [batch_size, max_len, feature_dim]

a = tf.constant([[2, 3, 5], [4, 5, 7], [3, 4, 5], [1, 5, 9]])

with tf.Session() as sess:
    print(sess.run(tf.reshape(a, [2, 2, 3])))

Import a .py file from the parent directory

import os, sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
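Once the parent directory is on sys.path, a module that lives there can be imported directly (the module name below is a hypothetical placeholder):

import my_utils  # hypothetical my_utils.py sitting in the parent directory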