AbsaOSS / spline-spark-agent

Spline agent for Apache Spark
https://absaoss.github.io/spline/
Apache License 2.0
176 stars 92 forks source link

Executing sparksession.sql in the spark jar will result in inaccurate reads collection #651

Closed jinmu0410 closed 1 year ago

jinmu0410 commented 1 year ago

This is the code: [screenshot: 截屏2023-04-14 10 58 53]

This is the resulting Spline JSON:

    {
        "id":"8bdf45c2-449f-56a0-b82a-6b0e662e4005",
        "name":"offline-dev-task-LXKF001114",
        "labels":{

        },
        "operations":{
            "write":{
                "outputSource":"hdfs://lake-node3:8020/user/hive/warehouse/dwd.db/dwd_wurui_lxkf_company_info",
                "append":false,
                "id":"op-0",
                "name":"SaveIntoDataSourceCommand",
                "childIds":[
                    "op-1"
                ],
                "params":{
                    "path":"/user/hive/warehouse/dwd.db/dwd_wurui_lxkf_company_info"
                },
                "extra":{
                    "destinationType":"delta"
                }
            },
            "reads":[
                {
                    "inputSources":[
                        "hdfs://lake-node3:8020/user/hive/warehouse/ods.db/ods_wurui_company_info_50/part-00000-fac8fd54-a725-44c1-b46c-aea61339223c-c000.snappy.parquet"
                    ],
                    "id":"op-13",
                    "name":"FileScanRDD",
                    "output":[

                    ],
                    "params":{

                    },
                    "extra":{

                    }
                }
            ],
            "other":[
                {
                    "id":"op-12",
                    "name":"MapPartitionsRDD",
                    "childIds":[
                        "op-13"
                    ],
                    "output":[

                    ],
                    "params":{

                    },
                    "extra":{

                    }
                },
                {
                    "id":"op-11",
                    "name":"MapPartitionsRDD",
                    "childIds":[
                        "op-12"
                    ],
                    "output":[

                    ],
                    "params":{

                    },
                    "extra":{

                    }
                },
                {
                    "id":"op-10",
                    "name":"MapPartitionsRDD",
                    "childIds":[
                        "op-11"
                    ],
                    "output":[

                    ],
                    "params":{

                    },
                    "extra":{

                    }
                },
                {
                    "id":"op-9",
                    "name":"SQLExecutionRDD",
                    "childIds":[
                        "op-10"
                    ],
                    "output":[

                    ],
                    "params":{

                    },
                    "extra":{

                    }
                },
                {
                    "id":"op-8",
                    "name":"MapPartitionsRDD",
                    "childIds":[
                        "op-9"
                    ],
                    "output":[

                    ],
                    "params":{

                    },
                    "extra":{

                    }
                },
                {
                    "id":"op-7",
                    "name":"MapPartitionsRDD",
                    "childIds":[
                        "op-8"
                    ],
                    "output":[

                    ],
                    "params":{

                    },
                    "extra":{

                    }
                },
                {
                    "id":"op-6",
                    "name":"MapPartitionsRDD",
                    "childIds":[
                        "op-7"
                    ],
                    "output":[

                    ],
                    "params":{

                    },
                    "extra":{

                    }
                },
                {
                    "id":"op-5",
                    "name":"MapPartitionsRDD",
                    "childIds":[
                        "op-6"
                    ],
                    "output":[

                    ],
                    "params":{

                    },
                    "extra":{

                    }
                },
                {
                    "id":"op-4",
                    "name":"LogicalRDD",
                    "childIds":[
                        "op-5"
                    ],
                    "output":[
                        "attr-0",
                        "attr-1",
                        "attr-2",
                        "attr-3",
                        "attr-4",
                        "attr-5",
                        "attr-6",
                        "attr-7",
                        "attr-8"
                    ],
                    "params":{
                        "rdd":"MapPartitionsRDD[30] at createDataFrame at UniqueIdUdf.scala:15",
                        "isStreaming":false,
                        "outputPartitioning":"UnknownPartitioning(0)"
                    },
                    "extra":{

                    }
                },
                {
                    "id":"op-3",
                    "name":"Project",
                    "childIds":[
                        "op-4"
                    ],
                    "output":[
                        "attr-0",
                        "attr-1",
                        "attr-2",
                        "attr-3",
                        "attr-4",
                        "attr-5",
                        "attr-6",
                        "attr-7",
                        "attr-8",
                        "attr-9"
                    ],
                    "params":{
                        "projectList":[
                            {
                                "id":"attr-0"
                            },
                            {
                                "id":"attr-1"
                            },
                            {
                                "id":"attr-2"
                            },
                            {
                                "id":"attr-3"
                            },
                            {
                                "id":"attr-4"
                            },
                            {
                                "id":"attr-5"
                            },
                            {
                                "id":"attr-6"
                            },
                            {
                                "id":"attr-7"
                            },
                            {
                                "id":"attr-8"
                            },
                            {
                                "id":"expr-0"
                            }
                        ]
                    },
                    "extra":{

                    }
                },
                {
                    "id":"op-2",
                    "name":"Project",
                    "childIds":[
                        "op-3"
                    ],
                    "output":[
                        "attr-0",
                        "attr-1",
                        "attr-2",
                        "attr-3",
                        "attr-4",
                        "attr-5",
                        "attr-6",
                        "attr-7",
                        "attr-9"
                    ],
                    "params":{
                        "projectList":[
                            {
                                "id":"attr-0"
                            },
                            {
                                "id":"attr-1"
                            },
                            {
                                "id":"attr-2"
                            },
                            {
                                "id":"attr-3"
                            },
                            {
                                "id":"attr-4"
                            },
                            {
                                "id":"attr-5"
                            },
                            {
                                "id":"attr-6"
                            },
                            {
                                "id":"attr-7"
                            },
                            {
                                "id":"attr-9"
                            }
                        ]
                    },
                    "extra":{

                    }
                },
                {
                    "id":"op-1",
                    "name":"Project",
                    "childIds":[
                        "op-2"
                    ],
                    "output":[
                        "attr-0",
                        "attr-1",
                        "attr-2",
                        "attr-3",
                        "attr-4",
                        "attr-5",
                        "attr-6",
                        "attr-7",
                        "attr-9",
                        "attr-10"
                    ],
                    "params":{
                        "projectList":[
                            {
                                "id":"attr-0"
                            },
                            {
                                "id":"attr-1"
                            },
                            {
                                "id":"attr-2"
                            },
                            {
                                "id":"attr-3"
                            },
                            {
                                "id":"attr-4"
                            },
                            {
                                "id":"attr-5"
                            },
                            {
                                "id":"attr-6"
                            },
                            {
                                "id":"attr-7"
                            },
                            {
                                "id":"attr-9"
                            },
                            {
                                "id":"expr-5"
                            }
                        ]
                    },
                    "extra":{

                    }
                }
            ]
        },
        "attributes":[
            {
                "id":"attr-0",
                "dataType":"e63adadc-648a-56a0-9424-3289858cf0bb",
                "childRefs":[

                ],
                "extra":{

                },
                "name":"company_name"
            },
            {
                "id":"attr-1",
                "dataType":"e63adadc-648a-56a0-9424-3289858cf0bb",
                "childRefs":[

                ],
                "extra":{

                },
                "name":"address"
            },
            {
                "id":"attr-2",
                "dataType":"e63adadc-648a-56a0-9424-3289858cf0bb",
                "childRefs":[

                ],
                "extra":{

                },
                "name":"address_city"
            },
            {
                "id":"attr-3",
                "dataType":"e63adadc-648a-56a0-9424-3289858cf0bb",
                "childRefs":[

                ],
                "extra":{

                },
                "name":"ds_id"
            },
            {
                "id":"attr-4",
                "dataType":"e63adadc-648a-56a0-9424-3289858cf0bb",
                "childRefs":[

                ],
                "extra":{

                },
                "name":"address_province"
            },
            {
                "id":"attr-5",
                "dataType":"e63adadc-648a-56a0-9424-3289858cf0bb",
                "childRefs":[

                ],
                "extra":{

                },
                "name":"address_area"
            },
            {
                "id":"attr-6",
                "dataType":"75fe27b9-9a00-5c7d-966f-33ba32333133",
                "childRefs":[

                ],
                "extra":{

                },
                "name":"insert_time"
            },
            {
                "id":"attr-7",
                "dataType":"a155e715-56ab-59c4-a94b-ed1851a6984a",
                "childRefs":[

                ],
                "extra":{

                },
                "name":"quality_filter_result"
            },
            {
                "id":"attr-8",
                "dataType":"ab4da308-91fb-550a-a5e4-beddecff2a2b",
                "childRefs":[

                ],
                "extra":{

                },
                "name":"supconit_id_tmp"
            },
            {
                "id":"attr-9",
                "dataType":"e63adadc-648a-56a0-9424-3289858cf0bb",
                "childRefs":[
                    {
                        "id":"expr-0"
                    }
                ],
                "extra":{

                },
                "name":"supconit_id"
            },
            {
                "id":"attr-10",
                "dataType":"ba7ef708-332f-54fd-a671-c91d13ae6f8e",
                "childRefs":[
                    {
                        "id":"expr-5"
                    }
                ],
                "extra":{

                },
                "name":"is_delete"
            }
        ],
        "expressions":{
            "functions":[
                {
                    "id":"expr-3",
                    "dataType":"455d9d5b-7620-529e-840b-897cee45e560",
                    "childRefs":[
                        {
                            "id":"attr-6"
                        }
                    ],
                    "extra":{
                        "simpleClassName":"Cast",
                        "_typeHint":"expr.Generic"
                    },
                    "name":"cast",
                    "params":{
                        "timeZoneId":"Asia/Shanghai",
                        "ansiEnabled":false
                    }
                },
                {
                    "id":"expr-2",
                    "dataType":"a155e715-56ab-59c4-a94b-ed1851a6984a",
                    "childRefs":[
                        {
                            "id":"expr-3"
                        }
                    ],
                    "extra":{
                        "simpleClassName":"Cast",
                        "_typeHint":"expr.Generic"
                    },
                    "name":"cast",
                    "params":{
                        "timeZoneId":"Asia/Shanghai",
                        "ansiEnabled":false
                    }
                },
                {
                    "id":"expr-4",
                    "dataType":"e63adadc-648a-56a0-9424-3289858cf0bb",
                    "childRefs":[
                        {
                            "id":"attr-8"
                        }
                    ],
                    "extra":{
                        "simpleClassName":"Cast",
                        "_typeHint":"expr.Generic"
                    },
                    "name":"cast",
                    "params":{
                        "timeZoneId":"Asia/Shanghai",
                        "ansiEnabled":false
                    }
                },
                {
                    "id":"expr-1",
                    "dataType":"e63adadc-648a-56a0-9424-3289858cf0bb",
                    "childRefs":[
                        {
                            "id":"expr-2"
                        },
                        {
                            "id":"expr-4"
                        }
                    ],
                    "extra":{
                        "simpleClassName":"Concat",
                        "_typeHint":"expr.Generic"
                    },
                    "name":"concat",
                    "params":{

                    }
                },
                {
                    "id":"expr-0",
                    "dataType":"e63adadc-648a-56a0-9424-3289858cf0bb",
                    "childRefs":[
                        {
                            "id":"expr-1"
                        }
                    ],
                    "extra":{
                        "simpleClassName":"Alias",
                        "_typeHint":"expr.Alias"
                    },
                    "name":"supconit_id",
                    "params":{
                        "name":"supconit_id",
                        "nonInheritableMetadataKeys":[
                            "__dataset_id",
                            "__col_position"
                        ]
                    }
                },
                {
                    "id":"expr-5",
                    "dataType":"ba7ef708-332f-54fd-a671-c91d13ae6f8e",
                    "childRefs":[
                        {
                            "id":"expr-6"
                        }
                    ],
                    "extra":{
                        "simpleClassName":"Alias",
                        "_typeHint":"expr.Alias"
                    },
                    "name":"is_delete",
                    "params":{
                        "name":"is_delete",
                        "nonInheritableMetadataKeys":[
                            "__dataset_id",
                            "__col_position"
                        ]
                    }
                }
            ],
            "constants":[
                {
                    "id":"expr-6",
                    "dataType":"ba7ef708-332f-54fd-a671-c91d13ae6f8e",
                    "extra":{
                        "simpleClassName":"Literal",
                        "_typeHint":"expr.Literal"
                    },
                    "value":0
                }
            ]
        },
        "systemInfo":{
            "name":"spark",
            "version":"3.3.1"
        },
        "agentInfo":{
            "name":"spline",
            "version":"1.1.0-SNAPSHOT+845a798"
        },
        "extraInfo":{
            "appName":"offline-dev-task-LXKF001114",
            "dataTypes":[
                {
                    "_typeHint":"dt.Simple",
                    "id":"e63adadc-648a-56a0-9424-3289858cf0bb",
                    "name":"string",
                    "nullable":true
                },
                {
                    "_typeHint":"dt.Simple",
                    "id":"75fe27b9-9a00-5c7d-966f-33ba32333133",
                    "name":"timestamp",
                    "nullable":false
                },
                {
                    "_typeHint":"dt.Simple",
                    "id":"a155e715-56ab-59c4-a94b-ed1851a6984a",
                    "name":"string",
                    "nullable":false
                },
                {
                    "_typeHint":"dt.Simple",
                    "id":"ab4da308-91fb-550a-a5e4-beddecff2a2b",
                    "name":"bigint",
                    "nullable":true
                },
                {
                    "_typeHint":"dt.Simple",
                    "id":"455d9d5b-7620-529e-840b-897cee45e560",
                    "name":"bigint",
                    "nullable":false
                },
                {
                    "_typeHint":"dt.Simple",
                    "id":"ba7ef708-332f-54fd-a671-c91d13ae6f8e",
                    "name":"int",
                    "nullable":false
                }
            ]
        }
    }

Question: why are `output`, `params` and `extra` empty in the read operation below?

            "reads":[
                {
                    "inputSources":[
                        "hdfs://lake-node3:8020/user/hive/warehouse/ods.db/ods_wurui_company_info_50/part-00000-fac8fd54-a725-44c1-b46c-aea61339223c-c000.snappy.parquet"
                    ],
                    "id":"op-13",
                    "name":"FileScanRDD",
                    "output":[

                    ],
                    "params":{

                    },
                    "extra":{

                    }
                }
            ]
jinmu0410 commented 1 year ago

@cerveada

jinmu0410 commented 1 year ago

The `output` and `extra` fields of the read operation are empty.

cerveada commented 1 year ago

This is caused by using RDDs on your side. Spark doesn't support RDD lineage: it provides very little lineage information when RDDs are used. Stick to DataFrames, Datasets and SQL if possible.

We still try to capture lineage when possible, even with RDDs, but it will be limited, so you won't see any attribute data or other metadata.

cerveada commented 1 year ago

See #33