I ran the command "scripts/run_local.sh nets/resnet_at_cifar10_run.py --learner channel --cp_prune_option list".
There was a process on GPU:0 before I ran this command.
I checked the GPU states with "nvidia-smi", so I was sure that this command was running on GPU:1 and that there was no process on GPU:0 before this command.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/server9/lst/pocketFlow/main.py", line 74, in <module>
tf.app.run()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "/home/server9/lst/pocketFlow/main.py", line 60, in main
learner.train()
File "/home/server9/lst/pocketFlow/learners/channel_pruning/learner.py", line 150, in train
self.prune_and_finetune_list()
File "/home/server9/lst/pocketFlow/learners/channel_pruning/learner.py", line 578, in prune_and_finetune_list
done = self.prune_list_layers(queue, [FLAGS.cp_list_group])
File "/home/server9/lst/pocketFlow/learners/channel_pruning/learner.py", line 583, in prune_list_layers
done = self.prune_n_layers(p, queue)
File "/home/server9/lst/pocketFlow/learners/channel_pruning/learner.py", line 590, in prune_n_layers
self.pruner.extract_features()
File "/home/server9/lst/pocketFlow/learners/channel_pruning/channel_pruner.py", line 324, in extract_features
feats = self._model.sess.run(names, feed_dict={self.mem_images: data})
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 929, in run
run_metadata_ptr)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1152, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1328, in _do_run
run_metadata)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1348, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InternalError: Blas SGEMM launch failed : m=131072, n=16, k=16
[[node model/resnet_model/conv2d_1/Conv2D (defined at /home/server9/lst/pocketFlow/learners/channel_pruning/learner.py:162) = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](model/resnet_model/Relu-0-0-TransposeNCHWToNHWC-LayoutOptimizer, model/resnet_model/conv2d_1/kernel/read)]]
[[{{node model/resnet_model/conv2d_13/Conv2D/_231}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_402_model/resnet_model/conv2d_13/Conv2D", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Caused by op 'model/resnet_model/conv2d_1/Conv2D', defined at:
File "/home/server9/lst/pocketFlow/main.py", line 74, in <module>
tf.app.run()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "/home/server9/lst/pocketFlow/main.py", line 60, in main
learner.train()
File "/home/server9/lst/pocketFlow/learners/channel_pruning/learner.py", line 141, in train
self.create_pruner()
File "/home/server9/lst/pocketFlow/learners/channel_pruning/learner.py", line 162, in create_pruner
self.saver = tf.train.import_meta_graph(FLAGS.cp_original_path + '.meta')
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1674, in import_meta_graph
meta_graph_or_file, clear_devices, import_scope, kwargs)[0]
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1696, in _import_meta_graph_with_return_elements
kwargs))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/meta_graph.py", line 806, in import_scoped_meta_graph_with_return_elements
return_elements=return_elements)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/importer.py", line 442, in import_graph_def
_ProcessNewOps(graph)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/importer.py", line 234, in _ProcessNewOps
for new_op in graph._add_new_tf_operations(compute_devices=False): # pylint: disable=protected-access
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 3440, in _add_new_tf_operations
for c_op in c_api_util.new_tf_operations(self)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 3440, in <listcomp>
for c_op in c_api_util.new_tf_operations(self)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 3299, in _create_op_from_tf_operation
ret = Operation(c_op, self)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
self._traceback = tf_stack.extract_stack()
I ran the command "scripts/run_local.sh nets/resnet_at_cifar10_run.py --learner channel --cp_prune_option list". There was a process on GPU:0 before I ran this command. I checked the GPU states with "nvidia-smi", so I was sure that this command was running on GPU:1 and that there was no process on GPU:0 before this command.
[WARNING] TF-Plus & Horovod cannot be imported; multi-GPU training is unsupported INFO:tensorflow:FLAGS: INFO:tensorflow:nb_classes: 10 INFO:tensorflow:uql_w_bit_min: 2 INFO:tensorflow:nuql_quantize_all_layers: False INFO:tensorflow:uql_enbl_rl_layerwise_tune: False INFO:tensorflow:ddpg_bsln_decy_rate: 0.95 INFO:tensorflow:uql_enbl_rl_global_tune: True INFO:tensorflow:uqtf_save_path: ./models_uqtf/model.ckpt INFO:tensorflow:ddpg_actor_depth: 2 INFO:tensorflow:nuql_bucket_type: split INFO:tensorflow:lrn_rate_init: 0.1 INFO:tensorflow:cp_lasso: True INFO:tensorflow:uql_bucket_type: channel INFO:tensorflow:uqtf_weight_bits: 8 INFO:tensorflow:cpr_prune_ratio: 0.5 INFO:tensorflow:uqtf_activation_bits: 8 INFO:tensorflow:cp_list_group: 1000 INFO:tensorflow:ws_iter_ratio_beg: 0.1 INFO:tensorflow:ddpg_gamma: 0.9 INFO:tensorflow:ws_prune_ratio_prtl: optimal INFO:tensorflow:buffer_size: 1024 INFO:tensorflow:uql_w_bit_max: 8 INFO:tensorflow:h: False INFO:tensorflow:nuql_w_bit_max: 8 INFO:tensorflow:enbl_fused_batchnorm: True INFO:tensorflow:uql_activation_bits: 32 INFO:tensorflow:ddpg_record_step: 1 INFO:tensorflow:cpr_save_path_ws: ./models_cpr_ws/model.ckpt INFO:tensorflow:uql_nb_rlouts: 200 INFO:tensorflow:tempr_dst: 4.0 INFO:tensorflow:cpr_nb_crops_per_smpl: 10 INFO:tensorflow:ws_nb_rlouts: 200 INFO:tensorflow:uqtf_save_path_probe_eval: ./models_uqtf_probe_eval/model.ckpt INFO:tensorflow:nuql_activation_bits: 32 INFO:tensorflow:cpg_save_path: ./models_cpg/model.ckpt INFO:tensorflow:cpr_lstsq_nb_iters: 100 INFO:tensorflow:ddpg_enbl_bsln_func: True INFO:tensorflow:uqtf_quant_delay: 0 INFO:tensorflow:prefetch_size: 8 INFO:tensorflow:cpg_lrn_rate_pgd_init: 1e-10 INFO:tensorflow:cp_retrain: False INFO:tensorflow:ddpg_noise_prtl: tdecy INFO:tensorflow:cp_reward_policy: accuracy INFO:tensorflow:cycle_length: 4 INFO:tensorflow:loss_w_dst: 4.0 INFO:tensorflow:uqtf_freeze_bn_delay: None INFO:tensorflow:cpr_skip_op_names: None INFO:tensorflow:save_path_eval: ./models_eval/model.ckpt 
INFO:tensorflow:resnet_size: 20 INFO:tensorflow:ws_nb_iters_rg: 20 INFO:tensorflow:nuql_use_buckets: False INFO:tensorflow:ddpg_tau: 0.01 INFO:tensorflow:uql_enbl_rl_agent: False INFO:tensorflow:enbl_dst: False INFO:tensorflow:uql_enbl_random_layers: True INFO:tensorflow:exec_mode: train INFO:tensorflow:cp_noise_tolerance: 0.15 INFO:tensorflow:uql_tune_disp_steps: 300 INFO:tensorflow:nuql_tune_save_path: ./rl_tune_models/model.ckpt INFO:tensorflow:dcp_save_path: ./models_dcp/model.ckpt INFO:tensorflow:nuql_enbl_rl_global_tune: True INFO:tensorflow:uql_equivalent_bits: 4 INFO:tensorflow:ws_nb_iters_ft: 400 INFO:tensorflow:nb_epochs_rat: 1.0 INFO:tensorflow:cpg_lrn_rate_pgd_decr: 0.7 INFO:tensorflow:ws_lrn_rate_ft: 0.0003 INFO:tensorflow:ddpg_loss_w_dcy: 0.0 INFO:tensorflow:summ_step: 100 INFO:tensorflow:nuql_enbl_rl_agent: False INFO:tensorflow:uqtf_save_path_probe: ./models_uqtf_probe/model.ckpt INFO:tensorflow:uql_tune_layerwise_steps: 100 INFO:tensorflow:cpg_nb_iters_layer: 1000 INFO:tensorflow:ddpg_noise_type: param INFO:tensorflow:cpr_ista_lrn_rate: 0.01 INFO:tensorflow:uql_quantize_all_layers: False INFO:tensorflow:batch_size: 128 INFO:tensorflow:cpg_skip_ht_layers: True INFO:tensorflow:ddpg_noise_adpt_rat: 1.03 INFO:tensorflow:cp_lrn_rate_ft: 0.0001 INFO:tensorflow:batch_size_eval: 128 INFO:tensorflow:helpfull: False INFO:tensorflow:nuql_init_style: quantile INFO:tensorflow:cp_original_path: ./models/original_model.ckpt INFO:tensorflow:ddpg_actor_width: 64 INFO:tensorflow:cpr_ista_nb_iters: 100 INFO:tensorflow:cpg_lrn_rate_pgd_incr: 1.4 INFO:tensorflow:ddpg_rms_eps: 0.0001 INFO:tensorflow:learner: channel INFO:tensorflow:enbl_warm_start: False INFO:tensorflow:loss_w_dcy: 0.0002 INFO:tensorflow:ddpg_lrn_rate: 0.001 INFO:tensorflow:uql_tune_global_steps: 2000 INFO:tensorflow:uql_weight_bits: 4 INFO:tensorflow:ws_reward_type: single-obj INFO:tensorflow:nuql_equivalent_bits: 4 INFO:tensorflow:nuql_w_bit_min: 2 INFO:tensorflow:cp_finetune: False 
INFO:tensorflow:ws_prune_ratio: 0.75 INFO:tensorflow:dcp_nb_iters_block: 10000 INFO:tensorflow:ddpg_batch_size: 64 INFO:tensorflow:dcp_lrn_rate_adam: 0.001 INFO:tensorflow:nuql_save_quant_model_path: ./nuql_quant_models/model.ckpt INFO:tensorflow:cpg_prune_ratio_file: None INFO:tensorflow:nuql_enbl_rl_layerwise_tune: False INFO:tensorflow:log_dir: ./logs INFO:tensorflow:uql_quant_epochs: 60 INFO:tensorflow:ddpg_noise_std_finl: 1e-05 INFO:tensorflow:data_hdfs_host: None INFO:tensorflow:data_dir_local: /home/server9/lst/pocketFlow/data/cifar-10-batches-bin INFO:tensorflow:cp_preserve_ratio: 0.5 INFO:tensorflow:save_step: 10000 INFO:tensorflow:uqtf_lrn_rate_dcy: 0.01 INFO:tensorflow:dcp_save_path_eval: ./models_dcp_eval/model.ckpt INFO:tensorflow:nuql_tune_global_steps: 2101 INFO:tensorflow:batch_size_norm: 128.0 INFO:tensorflow:ws_prune_ratio_exp: 3.0 INFO:tensorflow:cpg_lrn_rate_adam: 0.01 INFO:tensorflow:cp_channel_pruned_path: ./models/pruned_model.ckpt INFO:tensorflow:save_path_dst: ./models_dst/model.ckpt INFO:tensorflow:cpr_save_path: ./models_cpr/model.ckpt INFO:tensorflow:cp_prune_list_file: ratio.list INFO:tensorflow:uql_bucket_size: 256 INFO:tensorflow:save_path: ./models/model.ckpt INFO:tensorflow:model_http_url: https://api.ai.tencent.com/pocketflow INFO:tensorflow:uql_save_quant_model_path: ./uql_quant_models/uql_quant_model.ckpt INFO:tensorflow:ws_iter_ratio_end: 0.5 INFO:tensorflow:ws_lrn_rate_rg: 0.03 INFO:tensorflow:momentum: 0.9 INFO:tensorflow:nb_smpls_train: 50000 INFO:tensorflow:nuql_tune_layerwise_steps: 100 INFO:tensorflow:dcp_nb_iters_layer: 500 INFO:tensorflow:ddpg_noise_dst_finl: 0.01 INFO:tensorflow:uql_tune_save_path: ./rl_tune_models/model.ckpt INFO:tensorflow:cp_prune_option: list INFO:tensorflow:cpg_prune_ratio_type: uniform INFO:tensorflow:cp_quadruple: False INFO:tensorflow:uqtf_save_path_eval: ./models_uqtf_eval/model.ckpt INFO:tensorflow:nb_smpls_eval: 10000 INFO:tensorflow:cpr_warm_start: False INFO:tensorflow:data_disk: local 
INFO:tensorflow:cp_uniform_preserve_ratio: 0.6 INFO:tensorflow:cpg_save_path_eval: ./models_cpg_eval/model.ckpt INFO:tensorflow:nuql_quant_epochs: 60 INFO:tensorflow:uql_use_buckets: False INFO:tensorflow:nb_threads: 8 INFO:tensorflow:cp_nb_rlouts: 200 INFO:tensorflow:cpr_skip_last_layer: False INFO:tensorflow:cp_nb_iters_ft_ratio: 0.2 INFO:tensorflow:nb_smpls_val: 5000 INFO:tensorflow:ddpg_critic_width: 64 INFO:tensorflow:helpshort: False INFO:tensorflow:cpr_nb_smpls: 5000 INFO:tensorflow:cpr_skip_frst_layer: True INFO:tensorflow:ws_mask_update_step: 500.0 INFO:tensorflow:ws_save_path: ./models_ws/model.ckpt INFO:tensorflow:dcp_nb_stages: 3 INFO:tensorflow:cp_best_path: ./models/best_model.ckpt INFO:tensorflow:debug: False INFO:tensorflow:cp_nb_batches: 30 INFO:tensorflow:ddpg_critic_depth: 2 INFO:tensorflow:ws_nb_iters_feval: 25 INFO:tensorflow:nuql_opt_mode: weights INFO:tensorflow:nuql_nb_rlouts: 200 INFO:tensorflow:nuql_bucket_size: 256 INFO:tensorflow:cpr_lstsq_lrn_rate: 0.001 INFO:tensorflow:cpg_prune_ratio: 0.5 INFO:tensorflow:cpr_save_path_eval: ./models_cpr_eval/model.ckpt INFO:tensorflow:cp_nb_points_per_layer: 10 INFO:tensorflow:ddpg_noise_std_init: 1.0 INFO:tensorflow:nuql_enbl_random_layers: True INFO:tensorflow:nuql_weight_bits: 4 INFO:tensorflow:enbl_multi_gpu: False INFO:tensorflow:dcp_prune_ratio: 0.5 INFO:tensorflow:nuql_tune_disp_steps: 300 INFO:tensorflow:cp_nb_rlouts_min: 50 INFO:tensorflow:data_dir_hdfs: None INFO:tensorflow:ws_nb_rlouts_min: 50 INFO:tensorflow:help: False INFO:tensorflow:uqtf_enbl_manual_quant: False 2019-03-18 20:09:48.245403: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA 2019-03-18 20:09:48.420141: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:964] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2019-03-18 20:09:48.420727: I 
tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.815 pciBusID: 0000:08:00.0 totalMemory: 7.77GiB freeMemory: 7.62GiB 2019-03-18 20:09:48.420744: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0 2019-03-18 20:09:48.752578: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix: 2019-03-18 20:09:48.752643: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 2019-03-18 20:09:48.752651: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N 2019-03-18 20:09:48.752874: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 7335 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2080, pci bus id: 0000:08:00.0, compute capability: 7.5) WARNING:tensorflow:From /home/server9/lst/pocketFlow/datasets/abstract_dataset.py:86: parallel_interleave (from tensorflow.contrib.data.python.ops.interleave_ops) is deprecated and will be removed in a future version. Instructions for updating: Use
tf.data.experimental.parallel_interleave(...)
. WARNING:tensorflow:From /home/server9/lst/pocketFlow/datasets/abstract_dataset.py:107: shuffle_and_repeat (from tensorflow.contrib.data.python.ops.shuffle_ops) is deprecated and will be removed in a future version. Instructions for updating: Use `tf.data.experimental.shuffle_and_repeat(...)`
. INFO:tensorflow:Restoring parameters from ./models/original_model.ckpt INFO:tensorflow:model restored from ./models/original_model.ckpt 2019-03-18 20:09:49.840509: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0 2019-03-18 20:09:49.840608: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix: 2019-03-18 20:09:49.840616: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 2019-03-18 20:09:49.840622: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N 2019-03-18 20:09:49.840734: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 7335 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2080, pci bus id: 0000:08:00.0, compute capability: 7.5) INFO:tensorflow:Restoring parameters from ./models/original_model.ckpt INFO:tensorflow:The current model flops is 82149376.0 INFO:tensorflow:The original model flops is 82149376.0 INFO:tensorflow:current conv model/resnet_model/conv2d_1/Conv2D INFO:tensorflow:current conv model/resnet_model/conv2d_2/Conv2D INFO:tensorflow:current conv model/resnet_model/conv2d_3/Conv2D INFO:tensorflow:current conv model/resnet_model/conv2d_5/Conv2D INFO:tensorflow:current conv model/resnet_model/conv2d_7/Conv2D INFO:tensorflow:current conv model/resnet_model/conv2d_10/Conv2D INFO:tensorflow:current conv model/resnet_model/conv2d_12/Conv2D INFO:tensorflow:current conv model/resnet_model/conv2d_14/Conv2D INFO:tensorflow:current conv model/resnet_model/conv2d_17/Conv2D INFO:tensorflow:current conv model/resnet_model/conv2d_19/Conv2D INFO:tensorflow:current conv model/resnet_model/conv2d_21/Conv2D [1.0, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 1.0, 0.25, 1.0, 0.25, 0.21875, 0.21875, 0.21875, 1.0, 0.5625, 1.0, 0.546875, 0.546875, 0.546875, 1] [1.0, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 1.0, 0.25, 1.0, 0.25, 
0.21875, 0.21875, 0.21875, 1.0, 0.5625, 1.0, 0.546875, 0.546875, 0.546875, 1] deque([1.0, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 1.0, 0.25, 1.0, 0.25, 0.21875, 0.21875, 0.21875, 1.0, 0.5625, 1.0, 0.546875, 0.546875, 0.546875, 1]) INFO:tensorflow:current states: layer n c H W stride maxreduce layercomp 0 0.000000 0.25 0.046875 1.00 1.00 0.5 1.0 0.187500 1 0.047619 0.25 0.250000 1.00 1.00 0.5 1.0 0.111111 2 0.095238 0.25 0.250000 1.00 1.00 0.5 1.0 1.000000 3 0.142857 0.25 0.250000 1.00 1.00 0.5 1.0 1.000000 4 0.190476 0.25 0.250000 1.00 1.00 0.5 1.0 1.000000 5 0.238095 0.25 0.250000 1.00 1.00 0.5 1.0 1.000000 6 0.285714 0.25 0.250000 1.00 1.00 0.5 1.0 1.000000 7 0.333333 0.25 0.250000 1.00 1.00 0.5 1.0 1.000000 8 0.380952 0.50 0.250000 0.50 0.50 1.0 1.0 0.055556 9 0.428571 0.50 0.250000 0.50 0.50 1.0 1.0 0.500000 10 0.476190 0.50 0.500000 0.50 0.50 0.5 1.0 1.000000 11 0.523810 0.50 0.500000 0.50 0.50 0.5 1.0 1.000000 12 0.571429 0.50 0.500000 0.50 0.50 0.5 1.0 1.000000 13 0.619048 0.50 0.500000 0.50 0.50 0.5 1.0 1.000000 14 0.666667 0.50 0.500000 0.50 0.50 0.5 1.0 1.000000 15 0.714286 1.00 0.500000 0.25 0.25 1.0 1.0 0.055556 16 0.761905 1.00 0.500000 0.25 0.25 1.0 1.0 0.500000 17 0.809524 1.00 1.000000 0.25 0.25 0.5 1.0 1.000000 18 0.857143 1.00 1.000000 0.25 0.25 0.5 1.0 1.000000 19 0.904762 1.00 1.000000 0.25 0.25 0.5 1.0 1.000000 20 0.952381 1.00 1.000000 0.25 0.25 0.5 1.0 1.000000 21 1.000000 1.00 1.000000 0.25 0.25 0.5 1.0 1.000000 INFO:tensorflow:max_strategy_dict {'model/resnet_model/conv2d_19/Conv2D': [0.26413688858352186, 1.0], 'model/resnet_model/conv2d_9/Conv2D': [0.26413688858352186, 0.26413688858352186], 'model/resnet_model/conv2d/Conv2D': [1.0, 0.26413688858352186], 'model/resnet_model/conv2d_10/Conv2D': [0.26413688858352186, 1.0], 'model/resnet_model/conv2d_18/Conv2D': [0.26413688858352186, 0.26413688858352186], 'model/resnet_model/conv2d_1/Conv2D': [0.26413688858352186, 1.0], 'model/resnet_model/conv2d_4/Conv2D': [0.26413688858352186, 
0.26413688858352186], 'model/resnet_model/conv2d_6/Conv2D': [0.26413688858352186, 0.26413688858352186], 'model/resnet_model/conv2d_17/Conv2D': [0.26413688858352186, 1.0], 'model/resnet_model/conv2d_8/Conv2D': [0.26413688858352186, 1.0], 'model/resnet_model/conv2d_5/Conv2D': [0.26413688858352186, 1.0], 'model/resnet_model/conv2d_11/Conv2D': [0.26413688858352186, 0.26413688858352186], 'model/resnet_model/conv2d_20/Conv2D': [0.26413688858352186, 0.26413688858352186], 'model/resnet_model/conv2d_2/Conv2D': [0.26413688858352186, 0.26413688858352186], 'model/resnet_model/conv2d_16/Conv2D': [0.26413688858352186, 0.26413688858352186], 'model/resnet_model/conv2d_14/Conv2D': [0.26413688858352186, 1.0], 'model/resnet_model/conv2d_3/Conv2D': [0.26413688858352186, 1.0], 'model/resnet_model/conv2d_13/Conv2D': [0.26413688858352186, 0.26413688858352186], 'model/resnet_model/conv2d_12/Conv2D': [0.26413688858352186, 1.0], 'model/resnet_model/conv2d_7/Conv2D': [0.26413688858352186, 1.0], 'model/resnet_model/conv2d_21/Conv2D': [1.0, 1.0], 'model/resnet_model/conv2d_15/Conv2D': [0.26413688858352186, 1.0]} INFO:tensorflow:Start pruning 2019-03-18 20:09:52.114174: E tensorflow/stream_executor/cuda/cuda_blas.cc:652] failed to run cuBLAS routine cublasSgemm_v2: CUBLAS_STATUS_EXECUTION_FAILED Traceback (most recent call last): File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1334, in _do_call return fn(*args) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1319, in _run_fn options, feed_dict, fetch_list, target_list, run_metadata) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1407, in _call_tf_sessionrun run_metadata) tensorflow.python.framework.errors_impl.InternalError: Blas SGEMM launch failed : m=131072, n=16, k=16 [[{{node model/resnet_model/conv2d_1/Conv2D}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], 
use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](model/resnet_model/Relu-0-0-TransposeNCHWToNHWC-LayoutOptimizer, model/resnet_model/conv2d_1/kernel/read)]] [[{{node model/resnet_model/conv2d_13/Conv2D/_231}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_402_model/resnet_model/conv2d_13/Conv2D", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "/home/server9/lst/pocketFlow/main.py", line 74, in <module>
tf.app.run()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "/home/server9/lst/pocketFlow/main.py", line 60, in main
learner.train()
File "/home/server9/lst/pocketFlow/learners/channel_pruning/learner.py", line 150, in train
self.prune_and_finetune_list()
File "/home/server9/lst/pocketFlow/learners/channel_pruning/learner.py", line 578, in prune_and_finetune_list
done = self.prune_list_layers(queue, [FLAGS.cp_list_group])
File "/home/server9/lst/pocketFlow/learners/channel_pruning/learner.py", line 583, in prune_list_layers
done = self.prune_n_layers(p, queue)
File "/home/server9/lst/pocketFlow/learners/channel_pruning/learner.py", line 590, in prune_n_layers
self.pruner.extract_features()
File "/home/server9/lst/pocketFlow/learners/channel_pruning/channel_pruner.py", line 324, in extract_features
feats = self._model.sess.run(names, feed_dict={self.mem_images: data})
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 929, in run
run_metadata_ptr)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1152, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1328, in _do_run
run_metadata)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1348, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InternalError: Blas SGEMM launch failed : m=131072, n=16, k=16
[[node model/resnet_model/conv2d_1/Conv2D (defined at /home/server9/lst/pocketFlow/learners/channel_pruning/learner.py:162) = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](model/resnet_model/Relu-0-0-TransposeNCHWToNHWC-LayoutOptimizer, model/resnet_model/conv2d_1/kernel/read)]]
[[{{node model/resnet_model/conv2d_13/Conv2D/_231}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_402_model/resnet_model/conv2d_13/Conv2D", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Caused by op 'model/resnet_model/conv2d_1/Conv2D', defined at: File "/home/server9/lst/pocketFlow/main.py", line 74, in <module>
tf.app.run()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "/home/server9/lst/pocketFlow/main.py", line 60, in main
learner.train()
File "/home/server9/lst/pocketFlow/learners/channel_pruning/learner.py", line 141, in train
self.create_pruner()
File "/home/server9/lst/pocketFlow/learners/channel_pruning/learner.py", line 162, in create_pruner
self.saver = tf.train.import_meta_graph(FLAGS.cp_original_path + '.meta')
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1674, in import_meta_graph
meta_graph_or_file, clear_devices, import_scope, kwargs)[0]
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1696, in _import_meta_graph_with_return_elements
kwargs))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/meta_graph.py", line 806, in import_scoped_meta_graph_with_return_elements
return_elements=return_elements)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/importer.py", line 442, in import_graph_def
_ProcessNewOps(graph)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/importer.py", line 234, in _ProcessNewOps
for new_op in graph._add_new_tf_operations(compute_devices=False): # pylint: disable=protected-access
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 3440, in _add_new_tf_operations
for c_op in c_api_util.new_tf_operations(self)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 3440, in <listcomp>
for c_op in c_api_util.new_tf_operations(self)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 3299, in _create_op_from_tf_operation
ret = Operation(c_op, self)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
self._traceback = tf_stack.extract_stack()
InternalError (see above for traceback): Blas SGEMM launch failed : m=131072, n=16, k=16 [[node model/resnet_model/conv2d_1/Conv2D (defined at /home/server9/lst/pocketFlow/learners/channel_pruning/learner.py:162) = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](model/resnet_model/Relu-0-0-TransposeNCHWToNHWC-LayoutOptimizer, model/resnet_model/conv2d_1/kernel/read)]] [[{{node model/resnet_model/conv2d_13/Conv2D/_231}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_402_model/resnet_model/conv2d_13/Conv2D", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Process finished with exit code 1