TensorFlow: multiple GPUs perform worse than a single CPU in my code
I wrote multi-GPU code for a simple training job. The code running on 1 CPU + 2 GPUs takes more time than running on a single CPU, even when I set max_step to a large value such as 1000000.
The performance looks roughly like this: 1 CPU + 2 GPUs takes 27s, while a single CPU takes 20s.
I wonder whether a wrong op definition in my code keeps the GPUs from being fully utilized?
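One way I can think of to check where ops are actually placed is to turn on device placement logging; the snippet below is only a minimal sketch with a toy op (not my real graph), using the standard log_device_placement option of tf.ConfigProto:

import tensorflow as tf

with tf.device('/gpu:0'):
    a = tf.matmul(tf.ones((4, 4)), tf.ones((4, 4)))  # toy op just to check placement

# log_device_placement=True prints each op's assigned device when the session runs
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=True))
print(sess.run(a))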
Here is the code:
import tensorflow as tf
import os
import time

def gpu_inference(features, scope):
    w = tf.get_variable("weights", shape=(4, 1))
    b = tf.get_variable("bias", shape=(1))
    return tf.matmul(features, w) + b

def gpu_losses(logits, labels, scope):
    # all data is tf.float32
    labels = tf.reshape(labels, shape=(6, 1))
    delta = tf.square(logits - labels)
    losses = tf.reduce_mean(delta)
    tf.add_to_collection("losses_coll", losses)  # no use??
    return losses

def average_gradients(gpu_grads):
    # this computes the mean of the grads across GPUs.
    average_grads = []
    for grad_of_gpus in zip(*gpu_grads):
        grads = []
        for g, _ in grad_of_gpus:  # (g, _) is (grad0_gpu0, var0_gpu0)...
            grads.append(g)
        grad = tf.reduce_mean(grads, 0)
        v = grad_of_gpus[0][1]
        grad_and_var = (grad, v)  # v is the variable, grad is its averaged gradient
        average_grads.append(grad_and_var)
    return average_grads

# define under the cpu:
#  - variables (weights/bias/global_step/all_grads): w/b need "reuse" set.
#  - graph: read data into train_batch_queue
#  - graph/functions: average_gradients
# define under the gpus:
#  - graph: read a batch from the queue, inference, loss, the gpu's grads, and collect them in a global list
with tf.device('/cpu:0'):
    csvfiles = os.listdir('./data')
    csvfiles = [i for i in csvfiles if i[-4:] == '.csv']
    csvfiles = ['./data/' + i for i in csvfiles]
    fileq = tf.train.string_input_producer(csvfiles, shuffle=False)
    reader = tf.TextLineReader()
    key, value = reader.read(fileq)
    record_defaults = [[0.0], [0.0], [0.0], [0.0], [0]]
    col1, col2, col3, col4, label = tf.decode_csv(value, record_defaults=record_defaults)
    feature = tf.stack([col1, col2, col3, col4])

    num_gpus = 2
    feature_batch, label_batch = tf.train.shuffle_batch([feature, label], batch_size=6,
                                                        capacity=100, min_after_dequeue=1)
    # I think the queue should be on the cpu and each gpu uses dequeue to read batch data. Right?
    train_batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
        [feature_batch, label_batch], capacity=2 * num_gpus)

    max_step = 10000
    global_step_val = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0), trainable=False)
    weights_val = tf.get_variable('weights', shape=(4, 1), dtype=tf.float32)
    bias_val = tf.get_variable('bias', shape=(1), dtype=tf.float32)
    # define the variables and initializers on the cpu:
    local_init = tf.local_variables_initializer()  # why do I need this??
    global_init = tf.global_variables_initializer()

    gpu_grads = []
    for i in range(num_gpus):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % ("gpu", i)) as scope:  # no need?
                tf.get_variable_scope().reuse_variables()
                x_batch, y_batch = train_batch_queue.dequeue()
                y_batch = tf.cast(y_batch, dtype=tf.float32)
                inf_batch = gpu_inference(x_batch, scope)
                loss = gpu_losses(inf_batch, y_batch, scope)
                optimizer = tf.train.GradientDescentOptimizer(0.01)
                grads = optimizer.compute_gradients(loss)
                gpu_grads.append(grads)
    # end of gpus

    avg_grads = average_gradients(gpu_grads)  # synchronization point across towers.
    train_op = optimizer.apply_gradients(avg_grads, global_step=global_step_val)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # log_device_placement='/cpu:0' is wrong; it should be a bool, not a device string.
    sess.run([local_init, global_init])

    coord = tf.train.Coordinator()
    # create the thread coordinator; the queue runners reading the dataset need it.
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    start_time = time.time()
    for step in xrange(max_step):
        w = tf.get_variable("weights", shape=(4, 1))  # for test
        b = tf.get_variable("bias", shape=(1))  # for test
        _, loss_v, w, b, a_grads, g_step = sess.run([train_op, loss, w, b, avg_grads, global_step_val])
    duration = time.time() - start_time
    print("**duration is: ", duration)

    saver = tf.train.Saver()
    save_path = saver.save(sess, "./ex3.ckpt")
    print("**model saved in file: %s" % save_path)

    coord.request_stop()
    coord.join(threads)
    print("**end**")
# end of device(cpu)
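For readability, here is a small self-contained sketch of just the gradient-averaging pattern the code above follows (per-tower compute_gradients, then an element-wise mean per variable); the toy variable and inputs are made up for illustration only:

import tensorflow as tf

w = tf.get_variable("toy_w", shape=(2, 1))
opt = tf.train.GradientDescentOptimizer(0.01)

# one (grad, var) list per "tower", built from toy inputs
tower_grads = []
for x in (tf.constant([[1.0, 2.0]]), tf.constant([[3.0, 4.0]])):
    loss = tf.reduce_sum(tf.matmul(x, w))
    tower_grads.append(opt.compute_gradients(loss, var_list=[w]))

# average the gradients variable by variable, like average_gradients() does
avg_grads = []
for grads_and_vars in zip(*tower_grads):
    mean_grad = tf.reduce_mean([g for g, _ in grads_and_vars], 0)
    avg_grads.append((mean_grad, grads_and_vars[0][1]))

train_op = opt.apply_gradients(avg_grads)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)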