StarRocks / starrocks

StarRocks, a Linux Foundation project, is a next-generation sub-second MPP OLAP database for full analytics scenarios, including multi-dimensional analytics, real-time analytics, and ad-hoc queries.
https://starrocks.io
Apache License 2.0
8.67k stars 1.75k forks source link

Avg gives unstable answer diffrent from sum-divde-count #10849

Closed satanson closed 2 years ago

satanson commented 2 years ago

Steps to reproduce the behavior (Required)

DROP DATABASE IF EXISTS test_unstable_avg_db;
CREATE DATABASE test_unstable_avg_db;
USE test_unstable_avg_db;
DROP TABLE IF EXISTS t0;
DROP TABLE if exists t0;

CREATE TABLE if not exists t0
(
c0 INT NOT NULL,
c1 BIGINT NOT NULL
) ENGINE=OLAP
DUPLICATE KEY(`c0`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`c0`) BUCKETS 1
PROPERTIES(
"replication_num" = "1",
"in_memory" = "false",
"storage_format" = "default"
)
;
INSERT INTO t0
  (c0, c1)
VALUES
  ('0', '3571453256412945136'),
  ('1', '3066296858290086972'),
  ('2', '6376223999357030618'),
  ('3', '1872993654427111130'),
  ('4', '1905864233472284015'),
  ('5', '1199755371691544311'),
  ('6', '695763509027106409'),
  ('7', '87767084134126106'),
  ('8', '1992061837565954702'),
  ('9', '5155451685745971182'),
  ('10', '1312977452967956980'),
  ('11', '1299513856938906650'),
  ('12', '1205508937023260488'),
  ('13', '1765440901467523508'),
  ('14', '1921618638191915426'),
  ('15', '3263059343005619428'),
  ('16', '2215008845765732423'),
  ('17', '2723647151320734392'),
  ('18', '5546698936034911195'),
  ('19', '1519570882294071121'),
  ('20', '2119031171525722728'),
  ('21', '4723281440172665455'),
  ('22', '49649984148424252'),
  ('23', '920077881207290098'),
  ('24', '1362685358422574818'),
  ('25', '7099009613042818729'),
  ('26', '238653747256075113'),
  ('27', '3898297402743873926'),
  ('28', '3793766453539288'),
  ('29', '5656290229356981950'),
  ('30', '811015790014103465'),
  ('31', '5274329162203998963'),
  ('32', '49908938566042781'),
  ('33', '1068834016224896639'),
  ('34', '8748880454409415210'),
  ('35', '4047669081442439510'),
  ('36', '6706558524242127826'),
  ('37', '3515476952713956260'),
  ('38', '2521037622515883299'),
  ('39', '4263817966827230851'),
  ('40', '195475102353453464'),
  ('41', '2272906004808429144'),
  ('42', '9191522838632301700'),
  ('43', '59632367954134141'),
  ('44', '1742307209858385516'),
  ('45', '2439753115248275202'),
  ('46', '2019342782234551258'),
  ('47', '4432623954792850727'),
  ('48', '-7472660808541073623'),
  ('49', '2146467332738560139'),
  ('50', '572390616826723858'),
  ('51', '5279316111913403075'),
  ('52', '4214254316486467293'),
  ('53', '551433637963328712'),
  ('54', '551890330557656112'),
  ('55', '1827524453188788859'),
  ('56', '5908322079468582771'),
  ('57', '1261663346086380929'),
  ('58', '625589168251512431'),
  ('59', '4180487676621384780'),
  ('60', '3665027693839446412'),
  ('61', '3778220379242966441'),
  ('62', '7132953318216279281'),
  ('63', '1120738770918043763'),
  ('64', '-2584091082850489102'),
  ('65', '1271192322541226760'),
  ('66', '5381836759983261594'),
  ('67', '881537155169069074'),
  ('68', '2246898882898266393'),
  ('69', '6037697811342282368'),
  ('70', '8018910780113280906'),
  ('71', '6017579438246434584'),
  ('72', '6601414107367801882'),
  ('73', '777455898775850601'),
  ('74', '54466692334254064'),
  ('75', '4713752853350911124'),
  ('76', '7928508322485003080'),
  ('77', '1767338146276330761'),
  ('78', '2353711149239498994'),
  ('79', '94914828762882215'),
  ('80', '2223360105983759119'),
  ('81', '5584624339671869'),
  ('82', '3391654315887331862'),
  ('83', '4473371730342957076'),
  ('84', '-2569487732954954038'),
  ('85', '5333068384032733946'),
  ('86', '635589744809970417'),
  ('87', '76370493499813659'),
  ('88', '1025726063370004262'),
  ('89', '3525436745117371049'),
  ('90', '9172192558127985552'),
  ('91', '4017776837816903903'),
  ('92', '1329512530707854350'),
  ('93', '5446780328175192164'),
  ('94', '9090292707275701335'),
  ('95', '1954181812098943778'),
  ('96', '1393854143965927178'),
  ('97', '8164323767110225273'),
  ('98', '846324147514728688'),
  ('99', '1938318735997615814'),
  ('100', '6363577493992144110'),
  ('101', '4852785559183263955'),
  ('102', '4451109995577229046'),
  ('103', '2167898104212404297'),
  ('104', '1402635353163530901'),
  ('105', '2843627479245717578'),
  ('106', '5315166085478459676'),
  ('107', '8536225679928564680'),
  ('108', '1788076424813194986'),
  ('109', '1746937386109595315'),
  ('110', '2068502044538944559'),
  ('111', '499537701960793014'),
  ('112', '8323218400513703902'),
  ('113', '8323218400513703902'),
  ('114', '3533608209030189680'),
  ('115', '3533608209030189680'),
  ('116', '284883796156137832'),
  ('117', '2210833348506715599'),
  ('118', '5435078704862130754'),
  ('119', '8793767858791813295'),
  ('120', '1856980110867629130'),
  ('121', '8379739767754107052'),
  ('122', '4329166957024324406'),
  ('123', '1386679607306507416'),
  ('124', '519657463260869106'),
  ('125', '1220959403602338280'),
  ('126', '5790844100702256843'),
  ('127', '1554871861387282325'),
  ('128', '1554871861387282325'),
  ('129', '-2345784350228198886'),
  ('130', '9032494951086762163'),
  ('131', '38627743929737680'),
  ('132', '7901858313768697234'),
  ('133', '1632244946084803570'),
  ('134', '8132998850579355856'),
  ('135', '8132998850579355856'),
  ('136', '1838892462735114953'),
  ('137', '2263698254776028907'),
  ('138', '183689729540317353'),
  ('139', '710842514166401130'),
  ('140', '2104535578414626428'),
  ('141', '1130621399593768113'),
  ('142', '1130621399593768113'),
  ('143', '631110936380790247'),
  ('144', '1833669608629538280'),
  ('145', '728738257124066720'),
  ('146', '2073671784235732066'),
  ('147', '752780236628215972'),
  ('148', '1930621841290685500'),
  ('149', '8477093313321496820'),
  ('150', '237284546126356686'),
  ('151', '941450389504677021'),
  ('152', '834552580560637540'),
  ('153', '2280195317068710779'),
  ('154', '6331277145610099960'),
  ('155', '5005514881912256503'),
  ('156', '5237868628549997891'),
  ('157', '3963915849554510566'),
  ('158', '770410901443275405'),
  ('159', '5232287513787245951');;
select sum(c1), count(c1), sum(c1)/count(c1), avg(c1) from t0

Expected behavior (Required)

select sum(c1), count(c1), sum(c1)/count(c1), avg(c1) from t0

sum(c1)/count(c1) == avg(c1)

Real behavior (Required)

select sum(c1), count(c1), sum(c1)/count(c1), avg(c1) from t0

+---------------------+-----------+-------------------------+------------------------+
| sum(c1)             | count(c1) | (sum(c1)) / (count(c1)) | avg(c1)                |
+---------------------+-----------+-------------------------+------------------------+
| 1113019276342802297 |       160 |        6956370477142514 | 3.0045522824549443e+18 |
+---------------------+-----------+-------------------------+------------------------+
1 row in set (0.02 sec)

mysql> select sum(c1), count(c1), sum(c1)/count(c1), avg(c1) from t0;
+---------------------+-----------+-------------------------+------------------------+
| sum(c1)             | count(c1) | (sum(c1)) / (count(c1)) | avg(c1)                |
+---------------------+-----------+-------------------------+------------------------+
| 1113019276342802297 |       160 |        6956370477142514 | 3.0045522824549443e+18 |
+---------------------+-----------+-------------------------+------------------------+

when number of rows increase, avg gives a unstable answer. we use tpcds_100g; Q1: select avg(ws_promo_sk), sum(ws_promo_sk)/count(ws_promo_sk) from web_sales;

+--------------------+-------------------------------------------+
| avg(ws_promo_sk)   | (sum(ws_promo_sk)) / (count(ws_promo_sk)) |
+--------------------+-------------------------------------------+
| 500.52712020160664 |                        500.52712020160664 |
+--------------------+-------------------------------------------+

Q2: select avg(cast(ws_promo_sk4294967296 as bigint)), sum(cast(ws_promo_sk4294967296 as bigint))/count(ws_promo_sk) from web_sales;

mysql> select avg(cast(ws_promo_sk*4294967296 as bigint)), sum(cast(ws_promo_sk*4294967296 as bigint))/count(ws_promo_sk) from web_sales;
+-------------------------------------------------+--------------------------------------------------------------------------+
| avg(CAST((ws_promo_sk * 4294967296) AS BIGINT)) | (sum(CAST((ws_promo_sk * 4294967296) AS BIGINT))) / (count(ws_promo_sk)) |
+-------------------------------------------------+--------------------------------------------------------------------------+
|                              2149747612026.9614 |                                                         97123440636.1343 |
+-------------------------------------------------+--------------------------------------------------------------------------+

Q3: select avg(cast(ws_promo_sk429496729600000000000 as largeint)), sum(cast(ws_promo_sk429496729600000000000 as largeint))/count(ws_promo_sk) from web_sales;

+--------------------------------------------------------------+---------------------------------------------------------------------------------------+
| avg(CAST((ws_promo_sk * 429496729600000000000) AS LARGEINT)) | (sum(CAST((ws_promo_sk * 429496729600000000000) AS LARGEINT))) / (count(ws_promo_sk)) |
+--------------------------------------------------------------+---------------------------------------------------------------------------------------+
|                                       2.1497476120269635e+23 |                                                                2.1497476120269614e+23 |
+--------------------------------------------------------------+---------------------------------------------------------------------------------------+
1 row in set (0.10 sec)

mysql> select avg(cast(ws_promo_sk*429496729600000000000 as largeint)), sum(cast(ws_promo_sk*429496729600000000000 as largeint))/count(ws_promo_sk) from web_sales;
+--------------------------------------------------------------+---------------------------------------------------------------------------------------+
| avg(CAST((ws_promo_sk * 429496729600000000000) AS LARGEINT)) | (sum(CAST((ws_promo_sk * 429496729600000000000) AS LARGEINT))) / (count(ws_promo_sk)) |
+--------------------------------------------------------------+---------------------------------------------------------------------------------------+
|                                       2.1497476120269638e+23 |                                                                2.1497476120269614e+23 |
+--------------------------------------------------------------+---------------------------------------------------------------------------------------+
1 row in set (0.11 sec)

StarRocks version (Required)

Pslydhh commented 2 years ago

It caused by overflow of bigint,

mysql> select sum(cast(c1 as largeint)), count(c1), sum(cast(c1 as largeint))/count(c1), avg(c1) from t0;
+---------------------------+-----------+-------------------------------------------+-----------------------+
| sum(CAST(c1 AS LARGEINT)) | count(c1) | (sum(CAST(c1 AS LARGEINT))) / (count(c1)) | avg(c1)               |
+---------------------------+-----------+-------------------------------------------+-----------------------+
| 480728365192791144313     |       160 |                     3.004552282454945e+18 | 3.004552282454945e+18 |
+---------------------------+-----------+-------------------------------------------+-----------------------+