cleanup_duplicate_studies.sql 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. -- =============================================
  2. -- 清理重复的 study_instance_uid 记录
  3. -- 保留最早创建的那条,删除其他重复的
  4. -- 日期:2025-12-30
  5. -- =============================================
  6. USE qconline;
  7. -- =============================================
  8. -- 方案1:查看重复数据(安全,不删除)
  9. -- =============================================
  10. SELECT '=== 重复数据报告 ===' AS '';
  11. SELECT
  12. institution_id,
  13. study_instance_uid,
  14. COUNT(*) as duplicate_count,
  15. GROUP_CONCAT(study_id ORDER BY create_time) as all_study_ids,
  16. GROUP_CONCAT(id ORDER BY create_time) as all_ids,
  17. MIN(create_time) as first_created,
  18. MAX(create_time) as last_created,
  19. MIN(study_id) as first_study_id,
  20. MIN(id) as first_id
  21. FROM study_info
  22. GROUP BY institution_id, study_instance_uid
  23. HAVING COUNT(*) > 1;
  24. -- =============================================
  25. -- 方案2:标记要删除的重复记录(安全,不实际删除)
  26. -- =============================================
  27. SELECT '=== 标记要删除的重复记录 ===' AS '';
  28. -- 创建临时表存储要保留的记录ID
  29. DROP TEMPORARY TABLE IF EXISTS temp_keep_ids;
  30. CREATE TEMPORARY TABLE temp_keep_ids AS
  31. SELECT
  32. MIN(id) as keep_id
  33. FROM study_info
  34. GROUP BY institution_id, study_instance_uid
  35. HAVING COUNT(*) > 1;
  36. -- 查看将被删除的记录
  37. SELECT
  38. s.id,
  39. s.institution_id,
  40. s.study_instance_uid,
  41. s.study_id,
  42. s.create_time
  43. FROM study_info s
  44. WHERE s.id NOT IN (SELECT keep_id FROM temp_keep_ids)
  45. AND EXISTS (
  46. SELECT 1 FROM study_info s2
  47. WHERE s2.institution_id = s.institution_id
  48. AND s2.study_instance_uid = s.study_instance_uid
  49. GROUP BY s2.institution_id, s2.study_instance_uid
  50. HAVING COUNT(*) > 1
  51. )
  52. ORDER BY s.institution_id, s.study_instance_uid, s.create_time;
  53. -- =============================================
  54. -- 方案3:备份重复记录到历史表(推荐)
  55. -- =============================================
  56. SELECT '=== 备份重复记录到历史表 ===' AS '';
  57. -- 创建备份表(如果不存在)
  58. CREATE TABLE IF NOT EXISTS study_info_backup_20251230 (
  59. LIKE study_info
  60. );
  61. -- 插入重复记录到备份表
  62. INSERT INTO study_info_backup_20251230
  63. SELECT *
  64. FROM study_info
  65. WHERE id NOT IN (SELECT keep_id FROM temp_keep_ids)
  66. AND EXISTS (
  67. SELECT 1 FROM study_info s2
  68. WHERE s2.institution_id = study_info.institution_id
  69. AND s2.study_instance_uid = study_info.study_instance_uid
  70. GROUP BY s2.institution_id, s2.study_instance_uid
  71. HAVING COUNT(*) > 1
  72. );
  73. SELECT CONCAT('已备份 ', ROW_COUNT(), ' 条重复记录到 study_info_backup_20251230') AS '';
  74. -- 查看备份的记录数
  75. SELECT COUNT(*) as backed_up_count FROM study_info_backup_20251230;
  76. -- =============================================
  77. -- 方案4:删除重复记录(谨慎使用)
  78. -- =============================================
  79. -- ⚠️ 警告:此操作会永久删除数据,请先备份!
  80. -- ⚠️ 建议先执行方案1-3,确认无误后再执行本方案
  81. SELECT '=== 删除重复记录(谨慎使用)===' AS '';
  82. -- 取消下面的注释来执行删除操作
  83. /*
  84. -- 删除重复记录(保留最早的那条)
  85. DELETE FROM study_info
  86. WHERE id NOT IN (SELECT keep_id FROM temp_keep_ids)
  87. AND EXISTS (
  88. SELECT 1 FROM study_info s2
  89. WHERE s2.institution_id = study_info.institution_id
  90. AND s2.study_instance_uid = study_info.study_instance_uid
  91. GROUP BY s2.institution_id, s2.study_instance_uid
  92. HAVING COUNT(*) > 1
  93. );
  94. SELECT CONCAT('已删除 ', ROW_COUNT(), ' 条重复记录') AS '';
  95. */
  96. -- =============================================
  97. -- 方案5:智能合并重复记录(高级)
  98. -- =============================================
  99. SELECT '=== 智能合并重复记录说明 ===' AS '';
  100. /*
  101. 智能合并策略:
  102. 1. 保留最早创建的记录作为主记录
  103. 2. 合并统计信息:
  104. - series_count = MAX(series_count)
  105. - image_count = SUM(image_count)
  106. 3. 合并文件列表(如果 dicom_file_path 不同)
  107. 4. 更新 update_time 为当前时间
  108. 5. 删除其他重复记录
  109. ⚠️ 此操作较复杂,建议手动编写存储过程执行
  110. */
  111. -- =============================================
  112. -- 验证清理结果
  113. -- =============================================
  114. SELECT '=== 验证是否还有重复 ===' AS '';
  115. SELECT
  116. institution_id,
  117. study_instance_uid,
  118. COUNT(*) as count
  119. FROM study_info
  120. GROUP BY institution_id, study_instance_uid
  121. HAVING COUNT(*) > 1;
  122. -- 如果上面的查询结果为空,说明清理成功
  123. -- =============================================
  124. -- 清理建议
  125. -- =============================================
  126. /*
  127. 执行顺序建议:
  128. 1. 先执行方案1,查看重复数据情况
  129. 2. 执行方案2,查看哪些记录会被删除
  130. 3. 执行方案3,备份重复记录
  131. 4. 检查备份表数据,确认可以删除
  132. 5. 执行方案4,删除重复记录
  133. 6. 再次执行方案1的验证查询,确认无重复
  134. 7. 执行 add_study_unique_constraint.sql 添加唯一约束
  135. 注意事项:
  136. - 删除前务必备份!
  137. - 建议在测试环境先执行
  138. - 可以保留备份表一段时间,确认无问题后再删除
  139. - 删除操作会影响相关的 series_info 记录,需要级联处理
  140. */