数据库小技巧-使用开窗函数矫正数据库指定列部分列值重复的数据
需求描述
目前某表的某一列存在部分重复值,需要批量矫正该列数据,确保同一分组内该列的值不再重复。
解决思路
-- Scratch table used to reproduce the duplicate-value scenario.
CREATE TABLE t_tmp_20250428
(
    -- Primary-key value (no constraint is declared on it in this demo).
    c_bh    character varying(32),
    -- Foreign key to the parent table and the grouping column:
    -- c_mc may repeat across groups but must not repeat inside one group.
    c_bh_aj character varying(32),
    -- The column that currently contains duplicated values.
    c_mc    character varying(300),
    -- Sequence number inside the group; it is not always populated.
    n_xh    integer
);
-- Insert mock data that reproduces the problem scenario.
-- Mock rows: group aj1 has no duplicate names, aj2 repeats '再审申请书' three
-- times and aj3 repeats '判决书' three times.
-- c_bh is a random UUID with the dashes removed; the function call sits in the
-- SELECT list, so it is evaluated once per inserted row.
-- NOTE(review): public.uuid_generate_v4() is presumably provided by the
-- uuid-ossp extension installed in schema public -- confirm on the target DB.
INSERT INTO t_tmp_20250428 (c_bh, c_bh_aj, c_mc, n_xh)
SELECT
    replace(public.uuid_generate_v4()::text, '-', ''),
    v.c_bh_aj,
    v.c_mc,
    v.n_xh
FROM (
    VALUES
        ('aj1', '封皮', 1),
        ('aj1', '立案通知书', 2),
        ('aj1', '判决书', 3),
        ('aj1', '再审申请书', 4),
        ('aj2', '封皮', 1),
        ('aj2', '立案通知书', 2),
        ('aj2', '判决书', 3),
        ('aj2', '再审申请书', 4),
        ('aj2', '再审申请书', 5),
        ('aj2', '再审申请书', 6),
        ('aj3', '封皮', 1),
        ('aj3', '立案通知书', 2),
        ('aj3', '判决书', 3),
        ('aj3', '判决书', 4),
        ('aj3', '判决书', 5)
) AS v (c_bh_aj, c_mc, n_xh);
-- Before updating, preview the rows that would change and check that the
-- result matches expectations (on a large table add a filter, otherwise the
-- query can hang).
-- Preview of the planned rename: number the rows inside every
-- (c_bh_aj, c_mc) group and show only the 2nd, 3rd, ... occurrence together
-- with the name it would receive (c_mc followed by its occurrence number).
WITH ranked AS (
    SELECT
        c_bh,
        c_bh_aj,
        c_mc,
        n_xh,
        -- n_xh may be empty or tied, so c_bh is added as a tie-breaker to make
        -- the numbering the same on every run.
        row_number() OVER (
            PARTITION BY c_bh_aj, c_mc
            ORDER BY n_xh, c_bh
        ) AS n_xh_fenzu
    FROM t_tmp_20250428
)
SELECT
    c_bh,
    c_bh_aj,
    c_mc,
    c_mc || n_xh_fenzu AS c_mc_new,
    n_xh
FROM ranked
WHERE n_xh_fenzu > 1
ORDER BY c_bh_aj, c_mc, n_xh_fenzu;
-- Once the preview above looks right, back up the business database first and
-- only then run the update script. If there are further sub-grouping columns,
-- append them to PARTITION BY, e.g. PARTITION BY c_bh_aj, c_bh_ml, c_mc.
-- Rename duplicates: inside every (c_bh_aj, c_mc) group the first row keeps
-- its name, and each later row gets its occurrence number appended to c_mc.
-- NOTE(review): the generated name is not checked against names that already
-- exist in the group, nor against the varchar(300) limit -- verify on real data.
WITH ranked AS (
    SELECT
        c_bh,
        c_mc,
        -- n_xh may be empty or tied, so c_bh is added as a tie-breaker to make
        -- the choice of which row keeps the original name repeatable.
        row_number() OVER (
            PARTITION BY c_bh_aj, c_mc
            ORDER BY n_xh, c_bh
        ) AS n_xh_fenzu
    FROM t_tmp_20250428
)
UPDATE t_tmp_20250428 AS t
SET c_mc = ranked.c_mc || ranked.n_xh_fenzu
FROM ranked
WHERE ranked.n_xh_fenzu > 1
  AND t.c_bh = ranked.c_bh;
-- After the update, check once more that the result matches expectations
-- (on a large table add a filter, otherwise the query can hang).
-- Final check: list every row, ordered by group and then by sequence number.
SELECT
    c_bh,
    c_bh_aj,
    c_mc,
    n_xh
FROM t_tmp_20250428
ORDER BY
    c_bh_aj,
    n_xh;
-- Remember to drop the scratch table when done. For a pure demo the table can
-- instead be created with CREATE TEMP TABLE t_tmp_20250428 (...); the database
-- then removes it automatically when the session ends, so no manual drop is
-- needed.
-- Clean up the scratch table. IF EXISTS keeps this step re-runnable: it does
-- not raise an error when the table has already been dropped.
DROP TABLE IF EXISTS t_tmp_20250428;