处理mysql表中大量重复数据

Mysql对表中大量重复数据进行去重。

表结构(如果原表中没有索引,需要添加一个二级索引):

1
2
3
4
CREATE TABLE `md` (
  `md` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
  KEY `md` (`md`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

创建一个存储过程,批量插入数据。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
CREATE DEFINER=`root`@`%` PROCEDURE `BatchInsertMD5`(IN totalRecords INT)
BEGIN
    DECLARE currentBatch INT DEFAULT 1;
    DECLARE currentRecord INT DEFAULT 1;

    -- 禁用索引以加速插入
    ALTER TABLE md DISABLE KEYS;

    WHILE currentRecord <= totalRecords DO
        INSERT INTO md (md)
        SELECT MD5(CONV(currentRecord + (@rownum := @rownum + 1), 10, 16))
        FROM (
            SELECT @rownum := 0
        ) r1,
        (
            SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 UNION ALL SELECT 10
        ) r2,
        (
            SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 UNION ALL SELECT 10
        ) r3,
        (
            SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 UNION ALL SELECT 10
        ) r4,
        (
            SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 UNION ALL SELECT 10
        ) r5
        LIMIT 10000; -- 每次插入1万条记录

        SET currentRecord = currentRecord + 10000;
    END WHILE;

    -- 重新启用索引
    ALTER TABLE md ENABLE KEYS;
END

调用存储过程:CALL BatchInsertMD5(10000000);

数据去重

  • 创建临时表。
  • 将原表数据去重插入到临时表。
  • 移除原表,临时表重命名为原表名称。或者清空原表数据,将临时表数据插入到原表。
1
2
3
4
CREATE TABLE md_tmp LIKE md;

# 再执行之前,可以使用explain查看下执行计划,是否使用索引。
insert into md_tmp(md) select md from md group by md;