Mysql对表中大量重复数据进行去重。
表结构(如果原表中没有索引,需要添加一个二级索引):
1
2
3
4
|
CREATE TABLE `md` (
`md` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
KEY `md` (`md`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
|
创建一个存储过程,批量插入数据。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
CREATE DEFINER=`root`@`%` PROCEDURE `BatchInsertMD5`(IN totalRecords INT)
BEGIN
DECLARE currentBatch INT DEFAULT 1;
DECLARE currentRecord INT DEFAULT 1;
-- 禁用索引以加速插入
ALTER TABLE md DISABLE KEYS;
WHILE currentRecord <= totalRecords DO
INSERT INTO md (md)
SELECT MD5(CONV(currentRecord + (@rownum := @rownum + 1), 10, 16))
FROM (
SELECT @rownum := 0
) r1,
(
SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 UNION ALL SELECT 10
) r2,
(
SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 UNION ALL SELECT 10
) r3,
(
SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 UNION ALL SELECT 10
) r4,
(
SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 UNION ALL SELECT 10
) r5
LIMIT 10000; -- 每次插入1万条记录
SET currentRecord = currentRecord + 10000;
END WHILE;
-- 重新启用索引
ALTER TABLE md ENABLE KEYS;
END
|
调用存储过程:CALL BatchInsertMD5(10000000);
数据去重
- 创建临时表。
- 将原表数据去重插入到临时表。
- 移除原表,临时表重命名为原表名称。或者清空原表数据,将临时表数据插入到原表。
1
2
3
4
|
CREATE TABLE md_tmp LIKE md;
# 再执行之前,可以使用explain查看下执行计划,是否使用索引。
insert into md_tmp(md) select md from md group by md;
|