zhangjun commented 2 years ago

IR

zhangjun commented 2 years ago

Graph

zhangjun commented 2 years ago

Executor

zhangjun commented 2 years ago

phi

core

kernel using KernelFn = std::function<void(KernelContext* ctx)>; https://github.com/PaddlePaddle/Paddle/blob/3af98de560659956c98882e37d039b1349b4d0c2/paddle/phi/core/kernel_factory.h#L211

class Kernel {
public:
// for map element contruct
Kernel() = default;

explicit Kernel(KernelFn fn, void* variadic_fn)
   : fn_(fn), variadic_fn_(variadic_fn) {}

void operator()(KernelContext* ctx) const { fn_(ctx); }
private:
KernelFn fn_{nullptr};
void* variadic_fn_ = nullptr;
KernelArgsDef args_def_;
};

kernel factory

// Holder of the kernel table (`kernels_`), reached through the static
// Instance() accessor.
class KernelFactory {
public:
// Static accessor returning a KernelFactory reference. Only the declaration
// is visible in this excerpt, so how the instance is stored cannot be told
// from here.
static KernelFactory& Instance();

// Returns a mutable reference to the stored kernel map.
KernelNameMap& kernels() { return kernels_; }
private:
// The default constructor is private, so code outside the class cannot
// construct a KernelFactory directly.
KernelFactory() = default;

KernelNameMap kernels_;
};

kernel context

zhangjun commented 2 years ago

paddle 执行报错

FatalError: Segmentation fault is detected by the operating system. https://github.com/PaddlePaddle/Paddle/issues/36281 https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/disable_signal_handler_cn.html#disable-signal-handler

zhangjun commented 1 year ago

paddle distributed

distributed
|-- CMakeLists.txt
|-- auto_parallel
|   |-- CMakeLists.txt
|   |-- auto_parallel.proto
|   |-- device_mesh.cc
|   |-- device_mesh.h
|   |-- dist_attr.cc
|   |-- dist_attr.h                                     **TensorDistAttr**
|   |-- dist_mapper.cc                               **DistributedMapper**
|   |-- dist_mapper.h
|   |-- process_mesh.cc
|   |-- process_mesh.h
|   `-- utils.h
|-- collective
|   |-- ...
|   |-- CMakeLists.txt
|   |-- NCCLTools.cc
|   |-- NCCLTools.h
|   |-- ProcessGroup.cc
|   |-- ProcessGroup.h
|   |-- reducer.cc                                   **EagerReducer**
|   `-- reducer.h
|-- common
|   `-- ...
|-- fleet_executor
|   |-- CMakeLists.txt
|   |-- amplifier_interceptor.cc
|   |-- amplifier_interceptor.h
|   |-- carrier.cc                              **Carrier**(Source, Compute, Amplifier, Sink  -- TaskNode)
|   |-- carrier.h
|   |-- compute_interceptor.cc
|   |-- compute_interceptor.h
|   |-- dist_model.cc                             DistModel <- FleetExecutor
|   |-- dist_model.h
|   |-- dist_model_tensor_wrapper.cc
|   |-- dist_model_tensor_wrapper.h
|   |-- fleet_executor.cc                    FleetExecutor  <- RuntimeGraph, MessageBus, TaskNode, Carrier
|   |-- fleet_executor.h
|   |-- fleet_executor_desc.proto
|   |-- global.h
|   |-- interceptor.cc
|   |-- interceptor.h
|   |-- interceptor_message.proto
|   |-- message_bus.cc
|   |-- message_bus.h
|   |-- message_service.cc
|   |-- message_service.h
|   |-- runtime_graph.cc                        RuntimeGraph  <- TaskNode
|   |-- runtime_graph.h
|   |-- sink_interceptor.cc
|   |-- sink_interceptor.h
|   |-- source_interceptor.cc
|   |-- source_interceptor.h
|   |-- task_loop.cc
|   |-- task_loop.h
|   |-- task_loop_thread.cc
|   |-- task_loop_thread.h
|   |-- task_loop_thread_pool.cc
|   |-- task_loop_thread_pool.h
|   |-- task_node.cc
|   |-- task_node.h
|   `-- task_node.h
|-- index_dataset
|   |-- CMakeLists.txt
|   |-- index_dataset.proto
|   |-- index_sampler.cc                          **IndexSampler**
|   |-- index_sampler.h
|   |-- index_wrapper.cc
|   `-- index_wrapper.h
|-- ps
|   |-- CMakeLists.txt
|   |-- README.md
|   |-- service
|   |   |-- CMakeLists.txt
|   |   |-- README.md
|   |   |-- brpc_ps_client.cc
|   |   |-- brpc_ps_client.h
|   |   |-- brpc_ps_server.cc
|   |   |-- brpc_ps_server.h
|   |   |-- brpc_utils.cc
|   |   |-- brpc_utils.h
|   |   |-- communicator
|   |   |   |-- communicator.cc
|   |   |   |-- communicator.h
|   |   |   `-- communicator_common.h
|   |   |-- coordinator_client.cc
|   |   |-- coordinator_client.h
|   |   |-- env.cc
|   |   |-- env.h
|   |   |-- graph_brpc_client.cc
|   |   |-- graph_brpc_client.h
|   |   |-- graph_brpc_server.cc
|   |   |-- graph_brpc_server.h
|   |   |-- heter_client.cc
|   |   |-- heter_client.h
|   |   |-- heter_server.cc
|   |   |-- heter_server.h
|   |   |-- ps_client.cc
|   |   |-- ps_client.h
|   |   |-- ps_local_client.cc
|   |   |-- ps_local_client.h
|   |   |-- ps_local_server.h
|   |   |-- ps_service
|   |   |   |-- graph_py_service.cc
|   |   |   |-- graph_py_service.h
|   |   |   |-- service.cc
|   |   |   `-- service.h
|   |   |-- sendrecv.proto
|   |   |-- server.cc
|   |   `-- server.h
|   |-- table
|   |   |-- CMakeLists.txt
|   |   |-- accessor.h
|   |   |-- barrier_table.cc
|   |   |-- common_graph_table.cc
|   |   |-- common_graph_table.h
|   |   |-- common_table.h
|   |   |-- ctr_accessor.cc
|   |   |-- ctr_accessor.h
|   |   |-- ctr_double_accessor.cc
|   |   |-- ctr_double_accessor.h
|   |   |-- ctr_dymf_accessor.cc
|   |   |-- ctr_dymf_accessor.h
|   |   |-- depends
|   |   |   |-- dense.h
|   |   |   |-- feature_value.h
|   |   |   |-- geo_recorder.h
|   |   |   |-- initializers.h
|   |   |   |-- rocksdb_warpper.h
|   |   |   `-- sparse_utils.h
|   |   |-- graph
|   |   |   |-- class_macro.h
|   |   |   |-- graph_edge.cc
|   |   |   |-- graph_edge.h
|   |   |   |-- graph_node.cc
|   |   |   |-- graph_node.h
|   |   |   |-- graph_weighted_sampler.cc
|   |   |   `-- graph_weighted_sampler.h
|   |   |-- memory_dense_table.cc
|   |   |-- memory_dense_table.h
|   |   |-- memory_sparse_geo_table.cc
|   |   |-- memory_sparse_geo_table.h
|   |   |-- memory_sparse_table.cc
|   |   |-- memory_sparse_table.h
|   |   |-- sparse_accessor.cc
|   |   |-- sparse_accessor.h
|   |   |-- sparse_sgd_rule.cc
|   |   |-- sparse_sgd_rule.h
|   |   |-- ssd_sparse_table.cc
|   |   |-- ssd_sparse_table.h
|   |   |-- table.cc
|   |   |-- table.h
|   |   |-- tensor_accessor.cc
|   |   |-- tensor_accessor.h
|   |   |-- tensor_table.cc
|   |   `-- tensor_table.h
|   |-- thirdparty
|   |   `-- round_robin.h
|   `-- wrapper
|       |-- CMakeLists.txt
|       |-- fleet.cc
|       |-- fleet.h
|       |-- ps_cpu_wrapper.h
|       |-- ps_gpu_wrapper.h
|       |-- ps_heter_wrapper.h
|       `-- ps_wrapper.h
|-- store
|   |-- CMakeLists.txt
|   |-- socket.cpp
|   |-- socket.h
|   |-- store.h
|   |-- tcp_store.cc
|   |-- tcp_store.h
|   |-- tcp_utils.cc
|   |-- tcp_utils.h
|   `-- test_tcp_store.cc
`-- the_one_ps.proto

zhangjun commented 1 year ago

phi

api gen

paddle/phi/api/yaml/generator/parse_api.py --api_yaml_path ./ops.yaml --output_path ./parsed_apis/api.parsed.yaml

paddle/phi/api/yaml/generator/parse_api.py --api_yaml_path ./legacy_ops.yaml --output_path ./parsed_apis/legacy_api.parsed.yaml

generator/cross_validate.py --forward_yaml_paths
    ./parsed_apis/api.parsed.yaml ./parsed_apis/legacy_api.parsed.yaml
    --backward_yaml_paths ./parsed_apis/backward_api.parsed.yaml
    ./parsed_apis/legacy_backward_api.parsed.yaml

generator/generate_op.py --api_yaml_path
    ./parsed_apis/api.parsed.yaml --backward_api_yaml_path
    ./parsed_apis/backward_api.parsed.yaml --api_version_yaml_path
    op_version.yaml --op_compat_yaml_path op_compat.yaml --output_op_path
    "${generated_op_path}.tmp" --output_arg_map_path
    "${generated_argument_mapping_path}.tmp"

paddle/phi/api/yaml/generator/api_gen.py --api_yaml_path ${api_yaml_file}
    ${legacy_api_yaml_file} --api_header_path ${api_header_file_tmp}
    --api_source_path ${api_source_file_tmp}

    ${PYTHON_EXECUTABLE} ${wrapped_infermeta_gen_file} --api_yaml_path
    ${api_yaml_file} ${legacy_api_yaml_file} --wrapped_infermeta_header_path
    ${wrapped_infermeta_header_file} --wrapped_infermeta_source_path
    ${wrapped_infermeta_source_file}

zhangjun commented 1 year ago

Operator

zhangjun commented 1 year ago

Paddle 静态图设计思想

1、Program

将神经网络描述为Program数据结构。 Program由block组成，Program = List[Block]。 Block由Operator和Variable组成。即Block = List[Operator] + List[Variable]。

与编程语言类比，我们可以将Program理解为程序，Block对应程序的控制流分支结构，如条件分支、循环分支等。静态图的控制流Op（conditional_block，while，recurrent等）均通过Block表达。

不同Block里的变量可以重名。若父Block与子Block中存在同名变量，那么子Block的Operator运行时会优先找到子Block中的变量。

组建神经网络的过程中会涉及两个Program，即startup program和main program。 startup program对应TensorFlow中的tf.global_variables_initializer()，包含参数、learning rate、Optimizer Momentum等变量的初始化Op。

main program对应神经网络的主体结构。因此，在运行神经网络训练/预测时，我们需首先跑一次startup program进行初始化，然后跑多次main program进行训练/预测。

Python端的Program，Block，Operator，Variable分别对应于C++端的ProgramDesc，BlockDesc，OpDesc，VarDesc。 Program是对神经网络的静态描述，其底层是Protobuf Description，因此在运行网络以前所有变量、Op均不存在。

2、Place

Place表示设备，可以是GPU设备或CPU设备。 using Place = boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace>; boost::variant类似于C++的union，是一种类型安全的union，即multi-type, one-value。

同一设备的内存/显存的Place相同，即相同Place的Tensor的内存/显存空间在同一设备上。

3、DeviceContext

DeviceContext表示一个设备，是一个虚拟设备的概念。理论上，一个Place可以对应多个DeviceContext。DeviceContext包含和设备有关的额外信息，如cudaStream_t, cudnnHolder_t, cublasHandle_t等。

静态图设计中维护一个全局的DeviceContextPool，记录Place到DeviceContext的map映射。

4、Variable

一个类似std::any的结构，可以存储任意类型变量。常见存储Tensor，还可以存储SelectedRows、ReaderHolder。

5、Scope

Scope用于存储变量，主要数据成员如下：

// Simplified sketch showing only the main data members of Scope.
class Scope {
  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_; // variables stored in this Scope, keyed by name
  const Scope *parent_; // parent Scope
  std::list<Scope *> kids_; // child Scopes
};

Scope与编程语言的作用域类似，当调用Scope::FindVar时，会首先在当前Scope中查找变量是否存在，若存在则直接返回，否则递归地从父Scope里寻找该变量。

6、Operator

OP

OP主要包括4个信息：

type: std::string类型，表示Op的名称，如"matmul"，"conv2d"，"reshape"等。
inputs: std::map<std::string, std::vector<std::string>>类型，表示Op的输入变量，map的key为slot名称，对应于OpProtoAndCheckerMaker中定义的名称；map的value为实际的变量名，对应于Scope中的变量名。
outputs: std::map<std::string, std::vector<std::string>>类型，表示Op的输出变量。key和value的含义与inputs类似。
attributes: std::map<std::string, boost::variant<...>>类型，表示Op的属性，例如transpose选择哪些维度进行转置等。

OperatorBase

OperatorBase是所有Op的基类，其Run方法的声明为：

void OperatorBase::Run(const Scope &scope, const platform::Place& place) {...}

运行Op时，需指明Scope和Place。Op运行过程中，会首先从Scope中获取输入输出变量，然后从Place中获取设备信息，进行计算。

OpKernel

OperatorWithKernel继承自OperatorBase，我们称继承自OperatorWithKernel的Op为有Kernel的Op。

Kernel的目的是为了区分不同的运行设备（CPU/GPU）、数据类型（float/double/int）、库（MKLDNN/CUDNN）、layout（NCHW/NHWC）等。

一个Op可以有多个Kernel实现，Kernel实现应继承自OpKernel。

OperatorWithKernel重写了OperatorBase的RunImpl方法，进行了以下操作：

根据inputs和outputs，从Scope中找出所有输入输出变量，形成map<string, vector<Variable *>>，构造出ExecutionContext。
根据inputs Tensor的设备、layout等信息，判断是否需要对Tensor进行设备转换、Layout转换等。例如，若前一个Op的输出Tensor在CPU上，当前Op需要运行在GPU上，需要将当前Op的输入Tensor copy到GPU上。在转换过程中，会从当前Scope中new一个新的Scope，并在新Scope中创建同名变量进行Transfer。
调用OperatorWithKernel::InferShape方法推导输出变量的shape信息。
根据inputs Tensor的设备、layout、数据类型等信息，从所有的Kernel中选择合适的Kernel，将ExecutionContext传入OpKernel::Compute方法进行计算。

7、编译过程

8、运行过程

当Python端的Program构建完毕后，Executor::Run会取出Program的Block 0中的所有OpDesc，调用OpRegistry::CreateOp方法根据OpDesc创建OperatorBase，然后调用OperatorBase::Run()方法运行所有Op，具体方式为：

// Runs block 0 of `program`. An operator is first created for every OpDesc in
// the block; only after all of them exist are they executed, in creation
// order, against `scope` on `place`.
void Executor::Run(const ProgramDesc &program, const Scope &scope, const platform::Place &place) {
    std::vector<std::unique_ptr<OperatorBase>> created_ops;

    // Phase 1: build every operator before any of them runs.
    for (auto &desc : program.Block(0).AllOps()) {
        created_ops.emplace_back(OpRegistry::CreateOp(desc));
    }

    // Phase 2: execute the operators in the order they were created.
    for (auto it = created_ops.begin(); it != created_ops.end(); ++it) {
        (*it)->Run(scope, place);
    }
}

zhangjun commented 1 year ago

operator.cc

// Maps a string key to a list of variable names.
using VariableNameMap = std::map<std::string, std::vector<std::string>>;
// Same shape as VariableNameMap, but each entry holds Variable pointers
// instead of names.
using VariableValueMap = std::map<std::string, std::vector<Variable*>>;
// Value type of a single operator attribute: exactly one of the alternatives
// listed below.
// NOTE(review): the position of each alternative fixes its variant index, so
// do not reorder this list without checking every consumer of that index.
using Attribute = paddle::variant<paddle::blank,
                                  int,
                                  float,
                                  std::string,
                                  std::vector<int>,
                                  std::vector<float>,
                                  std::vector<std::string>,
                                  bool,
                                  std::vector<bool>,
                                  BlockDesc*,
                                  int64_t,
                                  std::vector<BlockDesc*>,
                                  std::vector<int64_t>,
                                  std::vector<double>,
                                  VarDesc*,
                                  std::vector<VarDesc*>,
                                  double>;
// Attribute name -> attribute value.
using AttributeMap = std::unordered_map<std::string, Attribute>;
// Factory callable that returns a raw OperatorBase pointer given the operator
// type, its input and output name maps, and its attributes.
// NOTE(review): ownership of the returned pointer is not visible here —
// confirm at the call sites.
using OpCreator =
    std::function<OperatorBase*(const std::string& /*type*/,
                                const VariableNameMap& /*inputs*/,
                                const VariableNameMap& /*outputs*/,
                                const AttributeMap& /*attrs*/)>;

// Holds the input and output VariableValueMap of an operator as two public
// members.
class RuntimeContext {
 public:
  // Declared here, defined elsewhere. Takes the input/output name maps plus a
  // Scope; presumably each name is resolved to a Variable through the scope —
  // confirm in the definition.
  RuntimeContext(const VariableNameMap& innames,
                 const VariableNameMap& outnames,
                 const Scope& scope);

  // Copies the already-resolved variable maps into `inputs` and `outputs`.
  RuntimeContext(const VariableValueMap& invars,
                 const VariableValueMap& outvars)
      : inputs(invars), outputs(outvars) {}

  VariableValueMap inputs;
  VariableValueMap outputs;
};

/**
 * User should always construct a proto message OpDesc and call
 * OpRegistry::CreateOp(op_desc) to get an Operator instance.
 */
// Base class of all operators. It stores the operator type, the input and
// output name maps, and two attribute maps (attrs_ and runtime_attrs_).
// Run() is the public entry point; subclasses must implement the private pure
// virtual RunImpl().
class OperatorBase {
 public:
  OperatorBase(const std::string& type,
               const VariableNameMap& inputs,
               const VariableNameMap& outputs,
               const AttributeMap& attrs);

  virtual ~OperatorBase() {}

  /// Executor will call this interface function to Run an op.
  //  The implementation should be written at RunImpl
  void Run(const Scope& scope, const platform::Place& place);

  // FIXME(typhoonzero): this is only used for recv_op to stop event_loop.
  virtual void Stop() {}

  // Defaults to false; subclasses may override.
  virtual bool SupportGPU() const { return false; }

  const std::string& Type() const { return type_; }

  // True when `name` is present in either attrs_ or runtime_attrs_.
  bool HasAttr(const std::string& name) const {
    return attrs_.count(name) || runtime_attrs_.count(name);
  }
  // Looks `name` up in attrs_ first and falls back to runtime_attrs_.
  // NOTE(review): if `name` is in neither map, `it` equals
  // runtime_attrs_.end() and is still dereferenced below. This excerpt shows
  // no guard — confirm against the full source before relying on it.
  template <typename T>
  inline const T& Attr(const std::string& name) const {
    auto it = attrs_.find(name);
    if (it == attrs_.end()) {
      it = runtime_attrs_.find(name);
    }
    return PADDLE_GET_CONST(T, it->second);
  }
  // Inserts or overwrites the entry in attrs_ only; runtime_attrs_ is not
  // touched.
  void SetAttr(const std::string& name, const Attribute& v) {
    attrs_[name] = v;
  }
  const AttributeMap& Attrs() const { return attrs_; }
  const AttributeMap& RuntimeAttrs() const { return runtime_attrs_; }
  // Replaces the whole runtime attribute map with a copy of `runtime_attrs`.
  void SetRuntimeAttributeMap(const AttributeMap& runtime_attrs) {
    runtime_attrs_ = runtime_attrs;
  }

  const VariableNameMap& Inputs() const { return inputs_; }
  const VariableNameMap& Outputs() const { return outputs_; }
  VariableNameMap& Inputs() { return inputs_; }
  VariableNameMap& Outputs() { return outputs_; }

  // NOTE(review): dereferences info_ without a null check, and info_ has no
  // in-class initializer in this excerpt — confirm that the constructor
  // always sets it.
  const OpInfo& Info() const {
    return *info_;
  }

  bool HasInputs(const std::string& name) const;
  //! Get an input with argument's name described in `op_proto`
  std::string Input(const std::string& name) const;
  //! Get an input which has multiple variables.
  const std::vector<std::string>& Inputs(const std::string& name) const;
  //! Get all inputs variable names
  std::vector<std::string> InputVars() const;

  bool HasOutputs(const std::string& name) const;
  //! Get an output with argument's name described in `op_proto`
  std::string Output(const std::string& name) const;
  //! Get an output which has multiple variables.
  //! TODO add a vector_view to prevent memory copy.
  const std::vector<std::string>& Outputs(const std::string& name) const;
  //! Get all outputs variable names
  virtual std::vector<std::string> OutputVars(bool has_intermediate) const;

  void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }

  // Default implementation does nothing; subclasses may override.
  virtual void RuntimeInferShape(const Scope& scope,
                                 const platform::Place& place,
                                 const RuntimeContext& ctx) const {}

  // Default implementation returns `place` unchanged.
  virtual platform::Place GetExecutionPlace(
      const platform::Place& place) const {
    return place;
  }

  uint64_t Id() const { return id_; }

  void SetId(uint64_t id) { id_ = id; }

 protected:
  std::string type_;
  VariableNameMap inputs_;

  VariableNameMap outputs_;
  AttributeMap attrs_;
  // NOTE: runtime_attrs_ contains the attributes which are used for
  // dispatching kernel (use_mkldnn, use_cudnn, ...) or passing additional
  // configuration for special heterogeneous kernel (workspace_size_MB, ...).
  // The attributes in runtime_attrs_ are set by framework (such as PASS),
  // and not in the python api.
  AttributeMap runtime_attrs_;

  // OpInfo
  const OpInfo* info_;

  // OpDesc Id; UINT64_MAX until SetId() is called.
  uint64_t id_ = UINT64_MAX;

  // Whether this operator executes in an Executor.
  bool run_by_executor_{true};

 private:
  void GenerateTemporaryNames();
  void CheckAllInputOutputSet() const;
  // Pure virtual: each concrete operator supplies its own implementation.
  virtual void RunImpl(const Scope& scope,
                       const platform::Place& place) const = 0;
};

基础组件

variant.h

zhangjun / zhangjun.github.io

Paddle #10

IR

Graph

Executor

phi

core

paddle 执行报错

paddle distributed

phi

api gen

Operator

Paddle 静态图设计思想

1、Program

2、Place

3、DeviceContext

4、Variable

5、Scope

6、Operator

OP

OperatorBase

OpKernel

7、编译过程

8、运行过程

operator.cc

基础组件