StarRocks / demo

Apache License 2.0
79 stars 55 forks source link

taobao data loaded on Hudi shows different results than taobao data loaded on StarRocks internal format #52

Closed alberttwong closed 5 months ago

alberttwong commented 5 months ago

https://forum.starrocks.io/t/retail-ecommerce-funnel-analysis-demo-with-1-million-members-and-87-million-record-dataset-using-starrocks/269

VS

mysql> select count(*) from user_behavior;
+----------+
| count(*) |
+----------+
| 86953525 |
+----------+
1 row in set (0.14 sec)

mysql> with tmp1 as (
    ->   with tmp as (
    ->     select
    ->       t.level as level,
    ->       count(UserID) as res
    ->     from
    ->       (
    ->         select
    ->           UserID,
    ->           window_funnel(
    ->             18000,
    ->             `Timestamp`,
    ->             0,
    ->             [BehaviorType = 'pv' ,
    ->             BehaviorType = 'cart',
    ->             BehaviorType = 'buy' ]
    ->           ) as level
    ->         from
    ->           user_behavior
    ->         where `Timestamp` >= '2017-12-02 00:00:00'
    ->             and `Timestamp` <= '2017-12-02 23:59:59'
    ->         group by
    ->           UserID
    ->       ) as t
    ->     where
    ->       t.level > 0
    ->     group by
    ->       t.level
    ->   )
    ->   select
    ->     tmp.level,
    ->     sum(tmp.res) over (
      order by
        tmp.level rows between current row
        and unbounded following
    ) as retention
  from
    tmp
)
select
  tmp1.level,
  tmp1.retention,
  last_value(tmp1.retention) over(
    order by
      tmp1.level rows between current row
      and 1 following
  )/ tmp1.retention as retention_ratio
from
  tmp1;    ->       order by
    ->         tmp.level rows between current row
    ->         and unbounded following
    ->     ) as retention
    ->   from
    ->     tmp
    -> )
    -> select
    ->   tmp1.level,
    ->   tmp1.retention,
    ->   last_value(tmp1.retention) over(
    ->     order by
    ->       tmp1.level rows between current row
    ->       and 1 following
    ->   )/ tmp1.retention as retention_ratio
    -> from
    ->   tmp1;
+-------+-----------+---------------------+
| level | retention | retention_ratio     |
+-------+-----------+---------------------+
|     1 |    913314 | 0.34725078122091635 |
|     2 |    317149 | 0.23266981765668504 |
|     3 |     73791 |                   1 |
+-------+-----------+---------------------+
3 rows in set (1.94 sec)

mysql> with tmp1 as (
    ->   with tmp as (
    ->     select
    ->       ItemID,
    ->       t.level as level,
    ->       count(UserID) as res
    ->     from
    ->       (
    ->         select
    ->           ItemID,
    ->           UserID,
    ->           window_funnel(
    ->             1800,
    ->             timestamp,
    ->             0,
    ->             [BehaviorType = 'pv',
    ->             BehaviorType ='buy' ]
    ->           ) as level
    ->         from
    ->           user_behavior
    ->         where timestamp >= '2017-12-02 00:00:00'
    ->             and timestamp <= '2017-12-02 23:59:59'
    ->         group by
    ->           ItemID,
    ->           UserID
    ->       ) as t
    ->     where
    ->       t.level > 0
    ->     group by
    ->       t.ItemID,
    ->       t.level
    ->   )
    ->   select
    ->     tmp.ItemID,
    ->     tmp.level,
    ->     sum(tmp.res) over (
    ->       partition by tmp.ItemID
    ->       order by
    ->         tmp.level rows between current row
    ->         and unbounded following
    ->     ) as retention
    ->   from
    ->     tmp
    -> )
    -> select
    ->   tmp1.ItemID,
    ->   tmp1.level,
    ->   tmp1.retention / last_value(tmp1.retention) over(
    ->     partition by tmp1.ItemID
    ->     order by
    ->       tmp1.level desc rows between current row
    ->       and 1 following
    ->   ) as retention_ratio
    -> from
    ->   tmp1
    -> order by
    ->   level desc,
    ->   retention_ratio
    -> limit
    ->   10;
+---------+-------+-----------------------+
| ItemID  | level | retention_ratio       |
+---------+-------+-----------------------+
|   59883 |     2 | 0.0003616636528028933 |
|  394978 |     2 | 0.0006357279084551812 |
| 1164931 |     2 | 0.0006648936170212766 |
| 4622270 |     2 | 0.0007692307692307692 |
|  812879 |     2 | 0.0009121313469139556 |
| 1783990 |     2 | 0.0009132420091324201 |
| 3847054 |     2 |  0.000925925925925926 |
| 2742138 |     2 | 0.0009881422924901185 |
|  530918 |     2 | 0.0010193679918450561 |
|  600756 |     2 | 0.0010319917440660474 |
+---------+-------+-----------------------+
10 rows in set (3.67 sec)

mysql> select
    ->   log.BehaviorType,
    ->   count(log.BehaviorType)
    -> from
    ->   (
    ->     select
    ->       ItemID,
    ->       UserID,
    ->       window_funnel(
    ->         1800,
    ->         timestamp,
    ->         0,
    ->         [BehaviorType = 'pv' ,
    ->         BehaviorType = 'buy' ]
    ->       ) as level
    ->     from
    ->       user_behavior
    ->     where timestamp >= '2017-12-02 00:00:00'
    ->         and timestamp <= '2017-12-02 23:59:59'
    ->     group by
    ->       ItemID,
    ->       UserID
    ->   ) as list
    ->   left join (
    ->     select
    ->       UserID,
    ->       array_agg(BehaviorType) as BehaviorType
    ->     from
    ->       user_behavior
    ->     where
    ->       ItemID = 3563468
    ->       and timestamp >= '2017-12-02 00:00:00'
    ->       and timestamp <= '2017-12-02 23:59:59'
    ->     group by
    ->       UserID
    ->   ) as log on list.UserID = log.UserID
    -> where
    ->   list.ItemID = 3563468
    ->   and list.level = 1
    -> group by
    ->   log.BehaviorType
    -> order by
    ->   count(BehaviorType) desc;
+--------------------------------------+-------------------------+
| BehaviorType                         | count(log.BehaviorType) |
+--------------------------------------+-------------------------+
| ["pv"]                               |                    1589 |
| ["pv","pv"]                          |                      52 |
| ["pv","pv","pv"]                     |                      10 |
| ["cart","pv"]                        |                       8 |
| ["cart","pv","pv"]                   |                       6 |
| ["fav","pv"]                         |                       6 |
| ["fav","pv","pv"]                    |                       3 |
| ["pv","pv","pv","pv"]                |                       2 |
| ["cart","pv","pv","pv"]              |                       2 |
| ["cart","pv","pv","pv","pv"]         |                       1 |
| ["pv","pv","pv","pv","pv","pv","pv"] |                       1 |
| ["fav","pv","pv","pv","pv"]          |                       1 |
| ["fav","pv","pv","pv","cart"]        |                       1 |
| ["pv","cart","pv"]                   |                       1 |
| ["pv","pv","cart"]                   |                       1 |
+--------------------------------------+-------------------------+
15 rows in set (3.10 sec)

mysql>
alberttwong commented 5 months ago

I cannot duplicate the results in https://github.com/StarRocks/starrocks/discussions/26075