Open AI-Friend opened 4 years ago
分布式训练,ps servers 分配哪些参数,workers分配哪些数据是在什么地方实现的 只看到run_loop.py 里有一个run_distributed()函数

```python
def run_distributed(flags_obj, run):
    cluster = tf.train.ClusterSpec({
        'ps': flags_obj.ps_hosts,
        'worker': flags_obj.worker_hosts
    })
    server = tf.train.Server(
        cluster,
        job_name=flags_obj.job_name,
        task_index=flags_obj.task_index)

    if flags_obj.job_name == 'ps':
        server.join()
    elif flags_obj.job_name == 'worker':
        if not euler_ops.initialize_shared_graph(
                directory=flags_obj.data_dir,
                zk_addr=flags_obj.euler_zk_addr,
                zk_path=flags_obj.euler_zk_path,
                shard_idx=flags_obj.task_index,
                shard_num=len(flags_obj.worker_hosts),
                global_sampler_type='node'):
            raise RuntimeError('Failed to initialize graph.')

        with tf.device(
                tf.train.replica_device_setter(
                    worker_device='/job:worker/task:%d' % flags_obj.task_index,
                    cluster=cluster)):
            run(flags_obj, server.target, flags_obj.task_index == 0)
    else:
        raise ValueError('Unsupport role: {}'.format(flags_obj.job_name))
```
模型参数都在ps上
分布式训练,ps servers 分配哪些参数,workers分配哪些数据是在什么地方实现的 只看到run_loop.py 里有一个run_distributed()函数

```python
def run_distributed(flags_obj, run):
    cluster = tf.train.ClusterSpec({
        'ps': flags_obj.ps_hosts,
        'worker': flags_obj.worker_hosts
    })
    server = tf.train.Server(
        cluster,
        job_name=flags_obj.job_name,
        task_index=flags_obj.task_index)
```