Tensorflow: multiple GPUs' performance worse than single CPU in my code -


I wrote simple multi-GPU training code. The code running on 1 CPU + 2 GPUs takes more time than on a single CPU, even when max_step is set to a large value such as 1000000.

The performance looks like this: 1 CPU + 2 GPUs takes 27 s, while a single CPU takes only 20 s.

I wonder whether a wrong op definition in my code prevents it from utilizing the GPUs fully?

Here is the code:

import tensorflow as tf
import os
import time


def gpu_inference(features, scope):
    """Linear model y = x @ w + b.

    Relies on the enclosing variable scope (with reuse enabled) so that
    every GPU tower shares the same 'weights'/'bias' variables, which
    live on the CPU.
    """
    w = tf.get_variable("weights", shape=(4, 1))
    b = tf.get_variable("bias", shape=(1))
    return tf.matmul(features, w) + b


def gpu_losses(logits, labels, scope):
    """Mean squared error between logits and labels (all tf.float32).

    NOTE(review): the reshape hard-codes the batch size 6 used by
    shuffle_batch below — reshape to (-1, 1) would be more general.
    """
    labels = tf.reshape(labels, shape=(6, 1))
    delta = tf.square(logits - labels)
    losses = tf.reduce_mean(delta)
    # Collection is never read back in this script; kept for parity with
    # the CIFAR-10 multi-GPU example this code follows.
    tf.add_to_collection("losses_coll", losses)
    return losses


def average_gradients(gpu_grads):
    """Average per-tower gradients.

    Args:
        gpu_grads: list over towers of lists of (gradient, variable)
            pairs, as returned by Optimizer.compute_gradients.

    Returns:
        A single list of (averaged_gradient, variable) pairs. This is
        the synchronization point across towers.
    """
    average_grads = []
    for grad_of_gpus in zip(*gpu_grads):
        # grad_of_gpus is ((grad0_gpu0, var0), (grad0_gpu1, var0), ...)
        grads = [g for g, _ in grad_of_gpus]
        grad = tf.reduce_mean(grads, 0)
        v = grad_of_gpus[0][1]  # variables are shared, so any tower's v works
        average_grads.append((grad, v))
    return average_grads


# Placement plan:
#   CPU: shared variables (weights/bias/global_step), input pipeline and
#        the prefetch queue, and the gradient averaging.
#   GPU i: dequeue a batch, inference, loss, and per-tower gradients.
with tf.device('/cpu:0'):

    # Build the list of ./data/*.csv input files.
    csvfiles = os.listdir('./data')
    csvfiles = [f for f in csvfiles if f[-4:] == '.csv']
    csvfiles = ['./data/' + f for f in csvfiles]

    fileq = tf.train.string_input_producer(csvfiles, shuffle=False)
    reader = tf.TextLineReader()
    key, value = reader.read(fileq)
    # Four float features and one int label per CSV row.
    record_defaults = [[0.0], [0.0], [0.0], [0.0], [0]]
    col1, col2, col3, col4, label = tf.decode_csv(
        value, record_defaults=record_defaults)
    feature = tf.stack([col1, col2, col3, col4])

    num_gpus = 2
    feature_batch, label_batch = tf.train.shuffle_batch(
        [feature, label], batch_size=6, capacity=100, min_after_dequeue=1)
    # The queue lives on the CPU; each GPU tower dequeues its own batch.
    train_batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
        [feature_batch, label_batch], capacity=2 * num_gpus)

    max_step = 10000
    global_step_val = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0), trainable=False)
    # Create the shared variables ONCE here; the towers re-fetch them via
    # reuse_variables() inside gpu_inference.
    weights_val = tf.get_variable('weights', shape=(4, 1), dtype=tf.float32)
    bias_val = tf.get_variable('bias', shape=(1), dtype=tf.float32)

    # Local variables back the input-pipeline bookkeeping (e.g. the
    # string_input_producer epoch counter), so both initializers are needed.
    local_init = tf.local_variables_initializer()
    global_init = tf.global_variables_initializer()

    # FIX: one shared optimizer, created once. The original constructed a
    # new GradientDescentOptimizer inside every tower and then used
    # whichever instance the loop left behind.
    optimizer = tf.train.GradientDescentOptimizer(0.01)

    gpu_grads = []
    for i in range(num_gpus):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % ("gpu", i)) as scope:
                # Share weights/bias across towers.
                tf.get_variable_scope().reuse_variables()
                x_batch, y_batch = train_batch_queue.dequeue()
                y_batch = tf.cast(y_batch, dtype=tf.float32)
                inf_batch = gpu_inference(x_batch, scope)
                loss = gpu_losses(inf_batch, y_batch, scope)
                grads = optimizer.compute_gradients(loss)
                gpu_grads.append(grads)

    avg_grads = average_gradients(gpu_grads)  # sync point across towers
    train_op = optimizer.apply_gradients(avg_grads, global_step=global_step_val)

    # allow_soft_placement lets ops without a GPU kernel fall back to CPU.
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run([local_init, global_init])

    # The queue runners feeding string_input_producer/shuffle_batch need a
    # coordinator to start and stop their threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    start_time = time.time()
    for step in range(max_step):
        # FIX: the original called tf.get_variable(...) inside this loop,
        # adding graph-construction work on every iteration, and fetched
        # avg_grads each step (extra device->host transfers). Fetch only
        # what the training step needs.
        _, loss_v, g_step = sess.run([train_op, loss, global_step_val])

    duration = time.time() - start_time
    print("**duration is: ", duration)

    saver = tf.train.Saver()
    save_path = saver.save(sess, "./ex3.ckpt")
    print("**model saved in file: %s" % save_path)

    coord.request_stop()
    coord.join(threads)

    print("**end**")


Comments

Popular posts from this blog

php - Vagrant up error - Uncaught Reflection Exception: Class DOMDocument does not exist -

vue.js - Create hooks for automated testing -

Add new key value to json node in java -