Postgresql源码(143)统计信息基础知识(带实例)
概念与总结
- 高频值(Most Common Values, MCV)
  - 高频值列表存储在 most_common_vals 中。
  - 每个高频值的频率通过 most_common_freqs 单独记录(例如 0.010966667 等)。
  - MCV 用于优化等值查询(如 poid = 33),直接通过频率计算选择率,无需依赖直方图。
- 直方图(Histogram Bounds)
  - 仅覆盖非高频值(未出现在 MCV 中的值)的分布。
  - 用于优化范围查询(如 poid BETWEEN 1050 AND 1200),通过分桶插值估算选择率。
- correlation:物理与逻辑顺序相关性
  - correlation 表示列值的物理存储顺序与逻辑顺序(升序/降序)的线性相关性。
  - 取值范围为 [-1, 1]:
    - 1:完全正相关(物理顺序与逻辑顺序一致,如自增主键)。
    - -1:完全负相关(物理顺序与逻辑顺序相反)。
    - 0:无相关性(随机存储)。
实例一:3000万高频重复值(0-999)
-- Instance 1: 30,000,000 rows whose poid cycles through 0..999 (n % 1000),
-- i.e. 30,000 rows per value.
-- With an evenly distributed column like this, sampling is very accurate.
CREATE TABLE ii (
    poid   INT NOT NULL,
    value  NUMERIC,
    status INT
);

INSERT INTO ii (poid, value, status)
SELECT
    g.n % 1000,
    g.n,
    0
FROM generate_series(1, 30000000) AS g(n);

CREATE INDEX idx_n_poid ON ii (poid);

-- Refresh planner statistics so pg_stats reflects the loaded data.
ANALYZE ii;
统计信息
postgres=# select * from pg_stats where tablename='ii' and attname='poid';
-[ RECORD 1 ]----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
schemaname | public
tablename | ii
attname | poid
inherited | f
null_frac | 0
avg_width | 4
n_distinct | 1000
most_common_vals | {334,580,881,431,2,33,79,112}
most_common_freqs | {0.0016,0.0015333333,0.0015333333,0.0015,0.0014666667,0.0014666667,0.0014666667,0.0014666667}
histogram_bounds | {0,10,20,30,41,52,61,71,82,91,99,109,119,129,139,149,159,168,178,189,197,207,217,226,236,246,256,266,276,286,296,306,315,325,336,346,356,365,374,386,396,406,417,427,438,448,458,468,478,488,498,508,518,528,537,549,558,569,578,590,599,609,619,629,639,649,659,669,679,687,697,707,719,729,740,750,760,771,781,791,800,810,820,829,839,849,859,869,880,891,901,910,921,931,941,950,961,970,980,990,999}
correlation | 0.002622716
most_common_elems |
most_common_elem_freqs |
elem_count_histogram |

postgres=# select count(distinct poid) from ii;
 count
-------
  1000
(1 row)

postgres=# select count(1) from ii;
  count
----------
 30000000
(1 row)
- 默认采样行数:300 × default_statistics_target(默认default_statistics_target=100,即采样30000行)
- n_distinct = 1000
- 唯一值个数与真实值相等,分布均匀的情况下采样最准。
- most_common_vals = {334,580,881,431,2,33,79,112}
- 数据均匀分布,随机采样 3 万行后,进入 MCV 的各个值频率也比较平均。
- histogram_bounds = {0,10,20,30,41,52,61,71,82,91,99,109,119,129,139,149,159,168,178,189,197,207,217,226,236,246,256,266,276,286,296,306,315,325,336,346,356,365,374,386,396,406,417,427,438,448,458,468,478,488,498,508,518,528,537,549,558,569,578,590,599,609,619,629,639,649,659,669,679,687,697,707,719,729,740,750,760,771,781,791,800,810,820,829,839,849,859,869,880,891,901,910,921,931,941,950,961,970,980,990,999}
- 直方图看起来比较平均,因为MCV比较少。
实例二:2800万高频重复值(0-99) + 200万个低频重复值(1000-1899)
-- Instance 2: skewed data set, 30,000,000 rows in total.
--   * 28,000,000 rows spread over the 100 values 0..99      (n % 100)
--   * 2,000,000 rows spread over the 900 values 1000..1899  (1000 + n % 900)
CREATE TABLE id (
    poid   INT NOT NULL,
    value  NUMERIC,
    status INT
);

-- High-frequency part.
INSERT INTO id (poid, value, status)
SELECT
    g.n % 100,
    g.n,
    0
FROM generate_series(1, 28000000) AS g(n);

-- Low-frequency part.
INSERT INTO id (poid, value, status)
SELECT
    1000 + (g.n % 900),
    g.n,
    0
FROM generate_series(28000001, 30000000) AS g(n);

CREATE INDEX idx_id_poid ON id (poid);

-- Refresh planner statistics so pg_stats reflects the loaded data.
ANALYZE id;
统计信息
postgres=# select * from pg_stats where tablename='id' and attname='poid';
-[ RECORD 1 ]----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
schemaname | public
tablename | id
attname | poid
inherited | f
null_frac | 0
avg_width | 4
n_distinct | 901
most_common_vals | {33,61,96,2,27,17,41,85,40,59,62,65,93,6,15,46,47,71,54,48,88,44,67,75,79,95,18,21,45,68,82,7,72,91,52,49,94,11,34,22,23,25,38,5,74,30,43,55,4,8,35,1,53,56,92,12,39,51,64,86,89,20,87,0,24,99,10,32,19,57,80,81,31,50,70,90,9,66,73,3,78,97,69,77,58,37,42,28,76,98,84,13,63,14,60,26,16,36,29,83}
most_common_freqs | {0.010966667,0.010933333,0.010866666,0.010533334,0.0104,0.0102,0.0102,0.0101666665,0.0101,0.010066667,0.010066667,0.009966667,0.009966667,0.009833333,0.009833333,0.009833333,0.009833333,0.009833333,0.0098,0.009766666,0.009766666,0.009733333,0.009733333,0.0097,0.0097,0.0097,0.009666666,0.009666666,0.009666666,0.009666666,0.009666666,0.009633333,0.009633333,0.009633333,0.009566667,0.009533334,0.009533334,0.0095,0.0095,0.009466667,0.009433334,0.009433334,0.009433334,0.0094,0.0094,0.009366667,0.009366667,0.009366667,0.009333333,0.009333333,0.009333333,0.0093,0.0093,0.0093,0.0093,0.009266667,0.009266667,0.009266667,0.009266667,0.009266667,0.009266667,0.009233333,0.009233333,0.0092,0.009166666,0.009166666,0.009133333,0.009133333,0.0090333335,0.009,0.008966667,0.008966667,0.008933334,0.008933334,0.008933334,0.008933334,0.0089,0.0089,0.0089,0.008866667,0.008866667,0.008866667,0.0088,0.008733333,0.0087,0.008666666,0.008633333,0.0086,0.0086,0.0086,0.008566666,0.008533333,0.008533333,0.0085,0.0085,0.008333334,0.0083,0.008266667,0.0082,0.0079}
histogram_bounds | {1000,1005,1019,1030,1036,1047,1056,1064,1072,1080,1089,1100,1110,1122,1132,1140,1149,1158,1168,1178,1188,1195,1208,1215,1226,1238,1245,1255,1264,1275,1286,1292,1299,1309,1316,1323,1332,1339,1347,1355,1368,1378,1390,1399,1407,1418,1430,1438,1448,1453,1462,1472,1483,1492,1507,1516,1524,1536,1543,1549,1556,1562,1572,1578,1586,1593,1604,1611,1621,1628,1635,1646,1654,1663,1673,1683,1693,1704,1709,1718,1727,1734,1745,1751,1758,1766,1776,1785,1791,1798,1806,1814,1825,1834,1841,1851,1861,1872,1883,1893,1899}
correlation | 0.19235307
most_common_elems |
most_common_elem_freqs |
elem_count_histogram |

postgres=# select count(distinct poid) from id;
 count
-------
  1000
(1 row)

postgres=# select count(1) from id;
  count
----------
 30000000
(1 row)
- n_distinct = 901
- 评估的唯一值个数比真实的少了99个,因为数据分布不均匀了
- 采样是30000个,但采集到0-99的概率:采集到1000-1899的概率=2800:200=14:1
- 所以这里会比真实值少一些。
- most_common_vals = {33,61,96,2,27,17,41,85,40,59,62,65,93,6,15,46,47,71,54,48,88,44,67,75,79,95,18,21,45,68,82,7,72,91,52,49,94,11,34,22,23,25,38,5,74,30,43,55,4,8,35,1,53,56,92,12,39,51,64,86,89,20,87,0,24,99,10,32,19,57,80,81,31,50,70,90,9,66,73,3,78,97,69,77,58,37,42,28,76,98,84,13,63,14,60,26,16,36,29,83}
- 为什么MCV的个数比实例一多很多?
- histogram_bounds = {1000,1005,1019,1030,1036,1047,1056,1064,1072,1080,1089,1100,1110,1122,1132,1140,1149,1158,1168,1178,1188,1195,1208,1215,1226,1238,1245,1255,1264,1275,1286,1292,1299,1309,1316,1323,1332,1339,1347,1355,1368,1378,1390,1399,1407,1418,1430,1438,1448,1453,1462,1472,1483,1492,1507,1516,1524,1536,1543,1549,1556,1562,1572,1578,1586,1593,1604,1611,1621,1628,1635,1646,1654,1663,1673,1683,1693,1704,1709,1718,1727,1734,1745,1751,1758,1766,1776,1785,1791,1798,1806,1814,1825,1834,1841,1851,1861,1872,1883,1893,1899}
- 直方图显示了非MCV值的分布情况,符合预期。
实例一的MCV只有8个值,实例二MCV个数远大于实例一,差异的原因?
- 实例一(均匀分布):poid 列通过 t.i%1000 生成,每个值重复 30,000 次(30,000,000 行 / 1000 个唯一值)。由于分布均匀,各个值之间的频率差异极小。PostgreSQL只会选择 《频率显著高于平均值》的值作为 MCV。理想情况下应该一个都选不出来,但由于随机采样存在波动,有几个值在样本中恰好略微高频,所以被选了出来。所以这里 MCV 数量少的原因是:数据太平均了,能选出来的值很少。
- 实例二(非均匀分布):poid 列分为 100 个高频值(每个重复 280,000 次)和 900 个低频值(每个重复约 2,222 次)。高频值的频率(约 0.01)远高于低频值(约 0.00007,即 2,222 / 30,000,000),PostgreSQL 将 《所有高频值》存入 most_common_vals。由于默认 default_statistics_target=100(MCV 列表最多 100 项),系统会尽量填满 MCV 列表。
实例三:2999万高频重复值(0-49) + 1万个低频重复值(1000-1949)
-- Instance 3: heavily skewed data set, 30,000,000 rows in total.
--   * 29,990,000 rows spread over the 50 values 0..49       (n % 50)
--   * 10,000 rows spread over the 950 values 1000..1949     (1000 + n % 950),
--     so each low-frequency value still repeats about 10 times.
CREATE TABLE iee (
    poid   INT NOT NULL,
    value  NUMERIC,
    status INT
);

-- High-frequency part.
INSERT INTO iee (poid, value, status)
SELECT
    g.n % 50,
    g.n,
    0
FROM generate_series(1, 29990000) AS g(n);

-- Low-frequency part.
INSERT INTO iee (poid, value, status)
SELECT
    1000 + (g.n % 950),
    g.n,
    0
FROM generate_series(29990001, 30000000) AS g(n);

CREATE INDEX idx_iee_poid ON iee (poid);

-- Refresh planner statistics so pg_stats reflects the loaded data.
ANALYZE iee;
统计信息
postgres=# select * from pg_stats where tablename='iee' and attname='poid';
-[ RECORD 1 ]----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
schemaname | public
tablename | iee
attname | poid
inherited | f
null_frac | 0
avg_width | 4
n_distinct | 56
most_common_vals | {26,3,41,1,20,38,15,37,8,22,46,27,32,48,29,7,43,30,44,12,16,42,5,25,40,24,34,39,6,49,0,11,28,21,18,14,17,33,47,35,31,2,4,10,45,9,19,23,13,36}
most_common_freqs | {0.021866666,0.0215,0.021233333,0.021066668,0.021,0.020966666,0.020766666,0.020766666,0.0206,0.0206,0.0206,0.020433333,0.020366667,0.020333333,0.0203,0.020266667,0.020266667,0.0202,0.020166667,0.020133333,0.0201,0.020066667,0.020033333,0.020033333,0.02,0.019933334,0.019933334,0.019933334,0.019866666,0.019866666,0.0198,0.0198,0.019766666,0.019633334,0.0196,0.019566666,0.019533332,0.019533332,0.0195,0.019466667,0.019433333,0.019366667,0.0193,0.019266667,0.0191,0.019033333,0.018933333,0.018833334,0.0187,0.018433332}
histogram_bounds | {1029,1243,1267,1378,1419,1797}
correlation | 0.026665932
most_common_elems |
most_common_elem_freqs |
elem_count_histogram |

postgres=# select count(distinct poid) from iee;
 count
-------
  1000
(1 row)

postgres=# select count(1) from iee;
  count
----------
 30000000
(1 row)
- n_distinct = 56
- 唯一值的偏差更大了
- 因为50个重复值占据了2999万数据,采样30000条大概率都会采样到这50个数,所以这里n_distinct只有56个。
- most_common_vals = {26,3,41,1,20,38,15,37,8,22,46,27,32,48,29,7,43,30,44,12,16,42,5,25,40,24,34,39,6,49,0,11,28,21,18,14,17,33,47,35,31,2,4,10,45,9,19,23,13,36}
- histogram_bounds = {1029,1243,1267,1378,1419,1797}
- 直方图的值少了很多,因为采样到最后1万的概率会很低。
附其他测试数据
-- Extra test data: table mm with 800 rows loaded in four batches of 200
-- consecutive poid values; ooid alternates 100 / 130 between batches.
-- IF EXISTS keeps the script runnable on a database where mm was never created.
DROP TABLE IF EXISTS mm;

CREATE TABLE mm (
    poid  INT PRIMARY KEY,
    edata DATE,
    ooid  INT
);

CREATE INDEX idx_mm_1 ON mm (edata);

-- NOTE(review): edata is declared DATE, so the time-of-day part of the
-- literals below is presumably dropped and every row stores 2025-01-01.
-- Confirm whether TIMESTAMP was intended.
INSERT INTO mm (poid, edata, ooid)
SELECT t.i, '2025-01-01 10:00:00', 100 FROM generate_series(1, 200) AS t(i);

INSERT INTO mm (poid, edata, ooid)
SELECT t.i, '2025-01-01 11:00:00', 130 FROM generate_series(201, 400) AS t(i);

INSERT INTO mm (poid, edata, ooid)
SELECT t.i, '2025-01-01 12:00:00', 100 FROM generate_series(401, 600) AS t(i);

INSERT INTO mm (poid, edata, ooid)
SELECT t.i, '2025-01-01 13:00:00', 130 FROM generate_series(601, 800) AS t(i);

ANALYZE mm;

-- Table ii: 30,000,000 rows whose poid cycles through 0..999 (i % 1000).
-- With an evenly distributed column like this, sampling is very accurate.
CREATE TABLE ii (
    poid   INT NOT NULL,
    value  NUMERIC,
    status INT
);

INSERT INTO ii (poid, value, status)
SELECT t.i % 1000, t.i, 0 FROM generate_series(1, 30000000) AS t(i);

CREATE INDEX idx_n_poid ON ii (poid);
analyze ii;

postgres=# select * from pg_stats where tablename='ii' and attname='poid';
-[ RECORD 1 ]----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
schemaname | public
tablename | ii
attname | poid
inherited | f
null_frac | 0
avg_width | 4
n_distinct | 1000
most_common_vals | {334,580,881,431,2,33,79,112}
most_common_freqs | {0.0016,0.0015333333,0.0015333333,0.0015,0.0014666667,0.0014666667,0.0014666667,0.0014666667}
histogram_bounds | {0,10,20,30,41,52,61,71,82,91,99,109,119,129,139,149,159,168,178,189,197,207,217,226,236,246,256,266,276,286,296,306,315,325,336,346,356,365,374,386,396,406,417,427,438,448,458,468,478,488,498,508,518,528,537,549,558,569,578,590,599,609,619,629,639,649,659,669,679,687,697,707,719,729,740,750,760,771,781,791,800,810,820,829,839,849,859,869,880,891,901,910,921,931,941,950,961,970,980,990,999}
correlation | 0.002622716
most_common_elems |
most_common_elem_freqs |
elem_count_histogram |

postgres=# select count(distinct poid) from ii;
 count
-------
  1000
(1 row)

postgres=# select count(1) from ii;
  count
----------
 30000000
(1 row)

-- 100个值高频重复值(2800万)+900个低频重复值(200万)
-- Table id: 28,000,000 rows over the values 0..99 (n % 100) plus
-- 2,000,000 rows over the values 1000..1899 (1000 + n % 900).
CREATE TABLE id (
    poid   INT NOT NULL,
    value  NUMERIC,
    status INT
);

INSERT INTO id (poid, value, status)
SELECT
    g.n % 100,
    g.n,
    0
FROM generate_series(1, 28000000) AS g(n);

INSERT INTO id (poid, value, status)
SELECT
    1000 + (g.n % 900),
    g.n,
    0
FROM generate_series(28000001, 30000000) AS g(n);

CREATE INDEX idx_id_poid ON id (poid);
analyze id;

postgres=# select * from pg_stats where tablename='id' and attname='poid';
-[ RECORD 1 ]----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
schemaname | public
tablename | id
attname | poid
inherited | f
null_frac | 0
avg_width | 4
n_distinct | 901
most_common_vals | {33,61,96,2,27,17,41,85,40,59,62,65,93,6,15,46,47,71,54,48,88,44,67,75,79,95,18,21,45,68,82,7,72,91,52,49,94,11,34,22,23,25,38,5,74,30,43,55,4,8,35,1,53,56,92,12,39,51,64,86,89,20,87,0,24,99,10,32,19,57,80,81,31,50,70,90,9,66,73,3,78,97,69,77,58,37,42,28,76,98,84,13,63,14,60,26,16,36,29,83}
most_common_freqs | {0.010966667,0.010933333,0.010866666,0.010533334,0.0104,0.0102,0.0102,0.0101666665,0.0101,0.010066667,0.010066667,0.009966667,0.009966667,0.009833333,0.009833333,0.009833333,0.009833333,0.009833333,0.0098,0.009766666,0.009766666,0.009733333,0.009733333,0.0097,0.0097,0.0097,0.009666666,0.009666666,0.009666666,0.009666666,0.009666666,0.009633333,0.009633333,0.009633333,0.009566667,0.009533334,0.009533334,0.0095,0.0095,0.009466667,0.009433334,0.009433334,0.009433334,0.0094,0.0094,0.009366667,0.009366667,0.009366667,0.009333333,0.009333333,0.009333333,0.0093,0.0093,0.0093,0.0093,0.009266667,0.009266667,0.009266667,0.009266667,0.009266667,0.009266667,0.009233333,0.009233333,0.0092,0.009166666,0.009166666,0.009133333,0.009133333,0.0090333335,0.009,0.008966667,0.008966667,0.008933334,0.008933334,0.008933334,0.008933334,0.0089,0.0089,0.0089,0.008866667,0.008866667,0.008866667,0.0088,0.008733333,0.0087,0.008666666,0.008633333,0.0086,0.0086,0.0086,0.008566666,0.008533333,0.008533333,0.0085,0.0085,0.008333334,0.0083,0.008266667,0.0082,0.0079}
histogram_bounds | {1000,1005,1019,1030,1036,1047,1056,1064,1072,1080,1089,1100,1110,1122,1132,1140,1149,1158,1168,1178,1188,1195,1208,1215,1226,1238,1245,1255,1264,1275,1286,1292,1299,1309,1316,1323,1332,1339,1347,1355,1368,1378,1390,1399,1407,1418,1430,1438,1448,1453,1462,1472,1483,1492,1507,1516,1524,1536,1543,1549,1556,1562,1572,1578,1586,1593,1604,1611,1621,1628,1635,1646,1654,1663,1673,1683,1693,1704,1709,1718,1727,1734,1745,1751,1758,1766,1776,1785,1791,1798,1806,1814,1825,1834,1841,1851,1861,1872,1883,1893,1899}
correlation | 0.19235307
most_common_elems |
most_common_elem_freqs |
elem_count_histogram |

postgres=# select count(distinct poid) from id;
 count
-------
  1000
(1 row)

postgres=# select count(1) from id;
  count
----------
 30000000
(1 row)

-- 50个值高频重复值(2970万)+950个低频重复值(30万)
-- Table ie: 29,700,000 rows over the values 0..49 (n % 50) plus
-- 300,000 rows over the values 1000..1949 (1000 + n % 950).
CREATE TABLE ie (
    poid   INT NOT NULL,
    value  NUMERIC,
    status INT
);

INSERT INTO ie (poid, value, status)
SELECT
    g.n % 50,
    g.n,
    0
FROM generate_series(1, 29700000) AS g(n);

INSERT INTO ie (poid, value, status)
SELECT
    1000 + (g.n % 950),
    g.n,
    0
FROM generate_series(29700001, 30000000) AS g(n);

CREATE INDEX idx_ie_poid ON ie (poid);
analyze ie;

postgres=# select * from pg_stats where tablename='ie' and attname='poid';
-[ RECORD 1 ]----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
schemaname | public
tablename | ie
attname | poid
inherited | f
null_frac | 0
avg_width | 4
n_distinct | 318
most_common_vals | {20,18,44,10,13,9,24,19,29,47,12,30,32,49,48,5,25,2,39,6,7,14,36,43,26,35,16,42,38,31,0,46,4,8,23,11,15,37,41,34,17,45,22,28,21,40,1,3,33,27}
most_common_freqs | {0.021433333,0.021133333,0.021066668,0.021,0.020933334,0.0209,0.020866666,0.020833334,0.020766666,0.0207,0.0206,0.0205,0.0205,0.0204,0.020333333,0.020266667,0.020266667,0.020166667,0.020166667,0.020133333,0.020133333,0.020133333,0.02,0.02,0.019866666,0.019766666,0.019733334,0.019733334,0.019666666,0.019566666,0.019533332,0.019533332,0.019366667,0.019366667,0.019366667,0.0193,0.019233333,0.019166667,0.0191,0.0189,0.018833334,0.018833334,0.018733334,0.0187,0.0186,0.0185,0.0184,0.0184,0.0183,0.0176}
histogram_bounds | {1003,1015,1024,1030,1040,1051,1082,1090,1101,1114,1118,1132,1137,1149,1164,1169,1173,1183,1191,1197,1204,1209,1225,1234,1250,1257,1272,1282,1288,1293,1300,1303,1312,1325,1332,1347,1361,1377,1392,1409,1415,1419,1427,1431,1440,1444,1448,1455,1462,1482,1490,1505,1514,1524,1529,1535,1539,1544,1558,1567,1575,1581,1584,1592,1597,1613,1620,1625,1639,1653,1669,1673,1681,1688,1698,1699,1707,1721,1727,1737,1755,1763,1768,1786,1795,1804,1810,1818,1822,1840,1856,1859,1869,1882,1890,1898,1902,1904,1917,1925,1943}
correlation | 0.057023432
most_common_elems |
most_common_elem_freqs |
elem_count_histogram |

postgres=# select count(distinct poid) from ie;
 count
-------
  1000
(1 row)

postgres=# select count(1) from ie;
  count
----------
 30000000
(1 row)

postgres=# select * from ie limit 10;
 poid | value | status
------+-------+--------
    1 |     1 |      0
    2 |     2 |      0
    3 |     3 |      0
    4 |     4 |      0
    5 |     5 |      0
    6 |     6 |      0
    7 |     7 |      0
    8 |     8 |      0
    9 |     9 |      0
   10 |    10 |      0
(10 rows)

-- 50个值高频重复值(2970万)+950个低频重复值(30万)+ 随机分布
CREATE TABLE ir (poid INT NOT NULL, value NUMERIC, status int);
-- Load table ir. Each batch is emitted in random order (ORDER BY random()).
--   * 29,700,000 rows over the values 0..49       (n % 50)
--   * 300,000 rows over the values 1000..1949     (1000 + n % 950)
INSERT INTO ir (poid, value, status)
SELECT
    g.n % 50,
    g.n,
    0
FROM generate_series(1, 29700000) AS g(n)
ORDER BY random();

INSERT INTO ir (poid, value, status)
SELECT
    1000 + (g.n % 950),
    g.n,
    0
FROM generate_series(29700001, 30000000) AS g(n)
ORDER BY random();

CREATE INDEX idx_ir_poid ON ir (poid);
analyze ir;

postgres=# select * from pg_stats where tablename='ir' and attname='poid';
-[ RECORD 1 ]----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
schemaname | public
tablename | ir
attname | poid
inherited | f
null_frac | 0
avg_width | 4
n_distinct | 330
most_common_vals | {9,29,14,34,25,46,17,39,21,1,18,45,49,27,38,41,42,36,30,11,35,16,47,2,15,40,0,31,24,37,19,44,48,23,3,4,13,6,32,33,43,12,26,10,28,8,22,5,20,7}
most_common_freqs | {0.021433333,0.021233333,0.021,0.0209,0.020833334,0.0208,0.020766666,0.020733334,0.020666666,0.020433333,0.020433333,0.020366667,0.020366667,0.0203,0.0203,0.0202,0.0202,0.020166667,0.020133333,0.0201,0.020066667,0.020033333,0.020033333,0.02,0.019833334,0.0198,0.019733334,0.0197,0.0196,0.0196,0.019566666,0.019566666,0.019566666,0.019466667,0.019433333,0.019366667,0.0193,0.019233333,0.019233333,0.019133333,0.0191,0.019033333,0.018833334,0.018733334,0.0187,0.018533334,0.018466666,0.018366667,0.018333333,0.017566666}
histogram_bounds | {1001,1006,1014,1017,1024,1040,1046,1058,1063,1073,1086,1098,1111,1117,1135,1142,1146,1158,1165,1175,1180,1189,1199,1204,1215,1232,1247,1250,1262,1273,1281,1286,1296,1299,1301,1314,1322,1325,1332,1336,1343,1353,1360,1374,1385,1404,1418,1434,1442,1456,1466,1476,1481,1487,1502,1511,1515,1518,1539,1550,1562,1573,1580,1583,1603,1621,1642,1659,1666,1675,1682,1690,1693,1703,1709,1726,1744,1751,1758,1762,1773,1778,1782,1787,1791,1803,1810,1830,1835,1853,1874,1880,1882,1895,1897,1907,1916,1927,1933,1938,1949}
correlation | 0.057593007
most_common_elems |
most_common_elem_freqs |
elem_count_histogram |

postgres=# select count(distinct poid) from ir;
 count
-------
  1000
(1 row)

postgres=# select count(1) from ir;
  count
----------
 30000000
(1 row)

postgres=# select * from ir limit 10;
 poid |  value   | status
------+----------+--------
    0 | 26165250 |      0
   43 | 21564993 |      0
    2 |  4975852 |      0
   47 | 28519047 |      0
   37 | 12294337 |      0
    3 | 15888653 |      0
   18 | 18510218 |      0
    0 | 16988450 |      0
   42 | 24585492 |      0
   32 | 19939732 |      0
(10 rows)

-- 50个值高频重复值(2999万)+950个低频重复值(1万)
-- Table iee: 29,990,000 rows over the values 0..49 (n % 50) plus
-- 10,000 rows over the values 1000..1949 (1000 + n % 950).
CREATE TABLE iee (
    poid   INT NOT NULL,
    value  NUMERIC,
    status INT
);

INSERT INTO iee (poid, value, status)
SELECT
    g.n % 50,
    g.n,
    0
FROM generate_series(1, 29990000) AS g(n);

INSERT INTO iee (poid, value, status)
SELECT
    1000 + (g.n % 950),
    g.n,
    0
FROM generate_series(29990001, 30000000) AS g(n);

CREATE INDEX idx_iee_poid ON iee (poid);
analyze iee;

postgres=# select * from pg_stats where tablename='iee' and attname='poid';
-[ RECORD 1 ]----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
schemaname | public
tablename | iee
attname | poid
inherited | f
null_frac | 0
avg_width | 4
n_distinct | 56
most_common_vals | {26,3,41,1,20,38,15,37,8,22,46,27,32,48,29,7,43,30,44,12,16,42,5,25,40,24,34,39,6,49,0,11,28,21,18,14,17,33,47,35,31,2,4,10,45,9,19,23,13,36}
most_common_freqs | {0.021866666,0.0215,0.021233333,0.021066668,0.021,0.020966666,0.020766666,0.020766666,0.0206,0.0206,0.0206,0.020433333,0.020366667,0.020333333,0.0203,0.020266667,0.020266667,0.0202,0.020166667,0.020133333,0.0201,0.020066667,0.020033333,0.020033333,0.02,0.019933334,0.019933334,0.019933334,0.019866666,0.019866666,0.0198,0.0198,0.019766666,0.019633334,0.0196,0.019566666,0.019533332,0.019533332,0.0195,0.019466667,0.019433333,0.019366667,0.0193,0.019266667,0.0191,0.019033333,0.018933333,0.018833334,0.0187,0.018433332}
histogram_bounds | {1029,1243,1267,1378,1419,1797}
correlation | 0.026665932
most_common_elems |
most_common_elem_freqs |
elem_count_histogram |

postgres=# select count(distinct poid) from iee;
 count
-------
  1000
(1 row)

postgres=# select count(1) from iee;
  count
----------
 30000000
(1 row)