format_manual_v2.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 格式化系统操作手册 - 改进版本,支持图片复制
  5. """
  6. from docx import Document
  7. from docx.shared import Pt, Inches
  8. from docx.enum.text import WD_ALIGN_PARAGRAPH
  9. from docx.oxml.ns import qn
  10. import re
  11. from io import BytesIO
  def set_font(run, font_name='宋体', font_size=12, bold=False):
      """Apply a typeface, point size and weight to *run*.

      Args:
          run: The run whose character formatting is changed.
          font_name: Typeface name, written both as the run's font name and
              as its ``w:eastAsia`` font attribute.
          font_size: Size in points (wrapped in ``Pt``).
          bold: Value assigned to the run's bold flag.
      """
      font = run.font
      font.name = font_name
      font.size = Pt(font_size)
      font.bold = bold

      # Record the same typeface on the East Asian font slot as well.
      # NOTE(review): this relies on rPr/rFonts already existing on the run
      # element, presumably as a result of the name assignment above, so the
      # statement order is kept.
      east_asia_attr = qn('w:eastAsia')
      run._element.rPr.rFonts.set(east_asia_attr, font_name)
  def is_heading_level_1(text):
      """Return True when *text* looks like a first-level heading.

      A first-level heading either starts with one or more digits followed by
      the enumeration comma "、" (for example "1、系统登录"), or is exactly the
      string "系统登录". Empty or ``None`` input is never a heading.
      """
      if text:
          numbered = re.match(r'^\d+、', text) is not None
          return numbered or text == '系统登录'
      return False
  def is_heading_level_2(text):
      """Return True when *text* looks like a second-level heading.

      The text must begin with ``<digits>.<digits>`` followed by a whitespace
      character, e.g. "1.1 系统登录". Empty, ``None`` or very short input
      (fewer than 3 characters) is rejected up front.
      """
      too_short = not text or len(text) < 3
      if too_short:
          return False
      return re.match(r'^\d+\.\d+\s', text) is not None
  def is_heading_level_3(text):
      """Return True when *text* looks like a third-level heading.

      The text must begin with ``<digits>.<digits>.<digits>`` followed by a
      whitespace character, e.g. "2.1.1 影像上传". Empty, ``None`` or very
      short input (fewer than 5 characters) is rejected up front.
      """
      too_short = not text or len(text) < 5
      if too_short:
          return False
      return re.match(r'^\d+\.\d+\.\d+\s', text) is not None
  def copy_paragraph_with_images(source_para, target_doc):
      """Append a restyled copy of *source_para* (text and pictures) to *target_doc*.

      The stripped paragraph text is classified with ``is_heading_level_1``,
      ``is_heading_level_2`` and ``is_heading_level_3`` (checked in that order)
      and written as a single run: headings use bold 黑体 at 16/14/13 pt with
      level-specific spacing, anything else uses 宋体 11 pt with 1.15 line
      spacing. An empty paragraph is still emitted for empty text.

      If the source paragraph contains a ``w:drawing``, a centred paragraph is
      added after the text paragraph and, for every run that holds a picture,
      the first ``a:blip`` of that run is re-inserted there. The size is taken
      from the run's first ``wp:extent`` (present for both inline and anchored
      drawings); without one the picture is inserted at its native size.

      Args:
          source_para: Paragraph taken from the source document.
          target_doc: Document that receives the new paragraph(s).

      Returns:
          None. A picture whose image part cannot be resolved, or that cannot
          be inserted, is reported on stdout and skipped instead of raising.
      """
      text = source_para.text.strip()
      new_para = target_doc.add_paragraph()
      para_format = new_para.paragraph_format

      # (font size, space before, space after) in points for each heading level.
      if is_heading_level_1(text):
          heading_spec = (16, 12, 6)
      elif is_heading_level_2(text):
          heading_spec = (14, 10, 5)
      elif is_heading_level_3(text):
          heading_spec = (13, 8, 4)
      else:
          heading_spec = None

      if heading_spec is not None:
          size, before, after = heading_spec
          set_font(new_para.add_run(text), font_name='黑体', font_size=size, bold=True)
          para_format.space_before = Pt(before)
          para_format.space_after = Pt(after)
      else:
          # Ordinary body text; the spacing is applied even when there is no text.
          if text:
              set_font(new_para.add_run(text), font_name='宋体', font_size=11)
          para_format.space_after = Pt(3)
          para_format.line_spacing = 1.15

      if not source_para._element.xpath('.//w:drawing'):
          return

      img_para = target_doc.add_paragraph()
      img_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

      for run in source_para.runs:
          blips = run._element.xpath('.//a:blip')
          if not blips:
              continue

          # A blip without r:embed (e.g. a linked picture) has no image part in
          # this package; skip it rather than letting KeyError abort the copy.
          rel_id = blips[0].get(qn('r:embed'))
          try:
              image_bytes = source_para.part.related_parts[rel_id].blob
          except KeyError:
              print(f"复制图片时出错: 找不到图片资源 {rel_id}")
              continue

          # wp:extent carries the displayed size in EMU (914400 per inch).
          size_kwargs = {}
          extents = run._element.xpath('.//wp:extent')
          if extents:
              try:
                  size_kwargs = {
                      'width': Inches(int(extents[0].get('cx')) / 914400),
                      'height': Inches(int(extents[0].get('cy')) / 914400),
                  }
              except (TypeError, ValueError) as e:
                  # Missing or non-numeric cx/cy: fall back to the native size.
                  print(f"复制图片时出错: {e}")
                  size_kwargs = {}

          try:
              img_para.add_run().add_picture(BytesIO(image_bytes), **size_kwargs)
          except Exception as e:
              print(f"复制图片时出错: {e}")
              if not size_kwargs:
                  continue
              # Best effort: retry once without an explicit size.
              try:
                  img_para.add_run().add_picture(BytesIO(image_bytes))
              except Exception as retry_error:
                  print(f"跳过无法复制的图片: {retry_error}")
  def format_document(input_path, output_path):
      """Rebuild the manual at *input_path* as a newly formatted document.

      The output consists of a title page, a fixed table-of-contents page and
      then a copy of every source paragraph (via ``copy_paragraph_with_images``),
      and is saved to *output_path*. Progress messages are printed to stdout.

      Args:
          input_path: Path of the source .docx file.
          output_path: Path the formatted .docx file is written to.
      """
      print("正在读取文档...")
      source_doc = Document(input_path)

      print("创建新文档...")
      new_doc = Document()

      # Page margins: 1 inch top/bottom, 1.25 inch left/right.
      vertical_margin = Inches(1)
      horizontal_margin = Inches(1.25)
      for section in new_doc.sections:
          section.top_margin = vertical_margin
          section.bottom_margin = vertical_margin
          section.left_margin = horizontal_margin
          section.right_margin = horizontal_margin

      # ===== Page 1: title page =====
      print("创建标题页...")
      # Eight empty paragraphs push the title down the page.
      blank_lines = 8
      while blank_lines > 0:
          new_doc.add_paragraph()
          blank_lines -= 1

      title_para = new_doc.add_paragraph()
      title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
      set_font(title_para.add_run('系统操作手册'), font_name='黑体', font_size=28, bold=True)
      new_doc.add_page_break()

      # ===== Page 2: table of contents =====
      print("创建目录...")
      toc_heading = new_doc.add_paragraph()
      toc_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
      set_font(toc_heading.add_run('目 录'), font_name='黑体', font_size=18, bold=True)
      new_doc.add_paragraph()

      # (entry text, nesting level); the level is carried along but not used
      # when the entries are rendered below.
      toc_items = [
          ('1、系统登录', 0),
          (' 1.1 系统登录', 1),
          (' 1.2 系统首页', 1),
          ('2、数据管理', 0),
          (' 2.1 检查列表', 1),
          (' 2.1.1 影像上传', 2),
          (' 2.1.2 查看影像数据详情', 2),
          (' 2.2 部位列表', 1),
          (' 2.3 患者信息', 1),
          ('3、质控管理', 0),
          (' 3.1 全量质控', 1),
          (' 3.2 质控任务', 1),
          (' 3.3 质控结果', 1),
          (' 3.4 部位结果', 1),
          ('4、字典管理', 0),
          (' 4.1 质控标准', 1),
          (' 4.2 质控因子', 1),
          (' 4.3 检查项目', 1),
          ('5、系统管理', 0),
          (' 5.1 用户管理', 1),
          (' 5.2 对接设置', 1),
      ]
      for entry_text, _level in toc_items:
          entry_para = new_doc.add_paragraph()
          set_font(entry_para.add_run(entry_text), font_name='宋体', font_size=12)
          entry_para.paragraph_format.space_after = Pt(2)
      new_doc.add_page_break()

      # ===== Page 3 onwards: body content =====
      print("复制正文内容...")
      seen_text = False
      copied = 0
      for para in source_doc.paragraphs:
          if not seen_text:
              # Skip everything before the first paragraph that has text.
              # NOTE: a leading paragraph holding only a picture has no text
              # and is therefore skipped as well.
              if not para.text.strip():
                  continue
              seen_text = True

          copy_paragraph_with_images(para, new_doc)
          copied += 1
          if copied % 10 == 0:
              print(f"已处理 {copied} 个段落...")

      print("保存文档...")
      new_doc.save(output_path)

      print("\n✓ 文档格式化完成!")
      print(f"✓ 已保存到: {output_path}")
      print(f"✓ 共处理 {copied} 个段落")
  if __name__ == '__main__':
      # Script entry point: format the manual on the desktop and report any
      # failure together with its traceback instead of letting it propagate.
      import traceback

      input_file = '/Users/geng/Desktop/系统操作手册.docx'
      output_file = '/Users/geng/Desktop/系统操作手册_格式化.docx'

      try:
          format_document(input_file, output_file)
      except Exception as error:
          print(f"\n✗ 错误: {error}")
          traceback.print_exc()