# -*- coding: UTF-8 -*-
r'''
Convert Word (.docx) documents to Markdown.

Author: zhangdongyu

How it works:
    1. Headings and inline code are recognised by the font colour of the text.
    2. Bold and italic runs are recognised from the run's font settings.
    3. Paragraphs whose style is Word's "List Paragraph" (bullets/numbering)
       are written as list items.
    4. Tables are written as fenced code blocks (first column of every row);
       the intended input is a 1x1 table holding the code.
    5. Other indicators (font size, italics, ...) could be used to detect
       headings, inline code or code blocks; adapt the source as needed.

Note: images in the document are not handled.

References:
    [python-docx documentation](https://python-docx.readthedocs.io/en/latest/index.html)
    [Day 105: Working with Word in Python](http://www.ityouknow.com/python/2019/12/31/python-word-105.html)
    [Processing Word documents with python-docx](https://zhuanlan.zhihu.com/p/61340025)
    [Reading text and tables from a Word document in order](https://blog.csdn.net/qq_39600166/article/details/101537368)

Configuration (entered as one line of JSON; backslashes in Windows paths must
be doubled):
    head_1_color:     colour of level-1 headings (7030A0: purple)
    head_2_color:     colour of level-2 headings (0070C0: blue)
    head_3_color:     colour of level-3 headings (00B050: green)
    head_4_color:     colour of level-4 headings (C55A11: orange)
    head_5_color:     colour of level-5 headings (FF66CC: pink)
    line_code_color:  colour of inline code (C00000: red)
    head_1_sharp_num: number of '#' for a level-1 heading; each deeper level
                      adds one more
    mode:             1 = convert a single docx file,
                      2 = convert every docx file in a directory
    src_path:         docx file path (mode 1) or directory path (mode 2)
                      mode 1, e.g. D:\\下载\\test.docx
                      mode 2, e.g. D:\\下载\\test
    save_path:        where the Markdown output is saved
                      mode 1, e.g. D:\\下载\\test.md
                      mode 2, e.g. D:\\下载\\test_md

Single docx to Markdown:
    {"head_1_color":"7030A0", "head_2_color":"0070C0", "head_3_color":"00B050", "head_4_color":"C55A11", "head_5_color":"FF66CC", "line_code_color":"C00000", "head_1_sharp_num":2, "mode":1, "src_path":"D:\\下载\\test.docx", "save_path":"D:\\下载\\test.md"}

Every docx in a directory to Markdown:
    {"head_1_color":"7030A0", "head_2_color":"0070C0", "head_3_color":"00B050", "head_4_color":"C55A11", "head_5_color":"FF66CC", "line_code_color":"C00000", "head_1_sharp_num":2, "mode":2, "src_path":"D:\\下载\\test", "save_path":"D:\\下载\\test_md"}
'''

import os
import shutil
import glob
import json

import docx
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.shared import RGBColor


def iter_block_items(parent):
    '''
    Yield each paragraph and table child within *parent*, in document order.

    Each returned value is an instance of either Table or Paragraph. *parent*
    would most commonly be a reference to a main Document object, but also
    works for a _Cell object, which itself can contain paragraphs and tables.

    Raises ValueError when *parent* is neither a Document nor a _Cell.
    '''
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)


def _hex_to_rgb(hex_color):
    '''Convert a six-digit hex string such as 'C00000' into an RGBColor.'''
    # int(..., 16) instead of eval(): the string comes from user-entered config.
    return RGBColor(int(hex_color[0:2], 16),
                    int(hex_color[2:4], 16),
                    int(hex_color[4:6], 16))


def _heading_level(runs):
    '''
    Return the zero-based heading level of a paragraph, or None.

    The level is decided by comparing the colour of the first run with the
    module-level heading colours h1c..h5c; the first match wins.
    '''
    if not runs:
        return None
    first_rgb = runs[0].font.color.rgb
    for level, hex_color in enumerate((h1c, h2c, h3c, h4c, h5c)):
        if first_rgb == _hex_to_rgb(hex_color):
            return level
    return None


def _format_runs(runs):
    '''Join the runs of a non-heading paragraph, adding inline Markdown marks.'''
    inline_code_rgb = _hex_to_rgb(lcc)  # parsed once, not once per run
    parts = []
    for run in runs:
        font = run.font
        if font.color.rgb == inline_code_rgb:
            # Inline code
            parts.append('`' + run.text + '`')
        # NOTE(review): cs_bold / cs_italic are the complex-script flags; kept
        # as in the original. Confirm whether font.bold / font.italic should
        # be checked as well.
        elif font.cs_bold:
            # Bold
            parts.append('**' + run.text + '** ')
        elif font.cs_italic:
            # Italic
            parts.append('*' + run.text + '* ')
        else:
            parts.append(run.text)
    return ''.join(parts)


def write_paragraph(block, f_md):
    '''
    Write one Word paragraph to *f_md* as a line of Markdown.

    block: the python-docx Paragraph to convert.
    f_md:  writable text file object receiving the Markdown.

    Reads the module-level settings h1c..h5c (heading colours), lcc (inline
    code colour) and h1n (number of '#' for a level-1 heading). They are
    assigned by the script entry point, so they must be set before this
    function is called from other code.
    '''
    runs = block.runs  # the property is re-evaluated on every access; read it once

    # A paragraph is a heading when its first run has a heading colour
    # (up to five levels are supported).
    level = _heading_level(runs)

    if level is not None:
        # Headings are emitted as plain text, without inline formatting.
        text = ''.join(run.text for run in runs)
    else:
        text = _format_runs(runs)

    # Bulleted / numbered paragraphs become list items; any other style
    # (including 'Normal') is written unchanged.
    if block.style.name == 'List Paragraph':
        text = '- ' + text

    # Write the text to the file.
    if level is not None:
        f_md.write('#' * (h1n + level) + ' ' + text + '\n')
    else:
        f_md.write(text + '\n')


def write_table(block, f_md):
    '''
    Write a Word table to *f_md* as a fenced code block.

    Only the first cell of every row is written, one row per line.
    '''
    f_md.write('```' + '\n')
    for row_idx in range(len(block.rows)):
        f_md.write(block.cell(row_idx, 0).text + '\n')
    f_md.write('```' + '\n')


def docx_2_markdown(docx_path, md_save_path):
    '''
    Convert the docx file at *docx_path* into a Markdown file at *md_save_path*.

    The output file is created or overwritten and encoded as UTF-8.
    '''
    # Load the document first so that an unreadable docx does not leave an
    # empty Markdown file behind.
    doc = docx.Document(docx_path)
    # The context manager closes the file even if a block fails to convert.
    with open(file=md_save_path, mode='wt', encoding='utf-8') as f_md:
        for block in iter_block_items(doc):
            if isinstance(block, Paragraph):
                write_paragraph(block, f_md)
            elif isinstance(block, Table):
                write_table(block, f_md)


if __name__ == '__main__':
    # User-facing help text; printed as-is before asking for the config.
    msg = '''
作者:zhangdongyu
简介:把word文档转为markdown文档
原理:
    1. 根据Word文字颜色判断是否为标题、行代码
    2. 根据Word文字是否加粗、倾斜进行加粗和倾斜判定
    3. 根据Word“项目符号/编号”判断是否为列表
    4. 根据Word 1x1的表格判断是否为块代码
    5. 当然也可以根据字体大小、是否倾斜、或者其它指标判断是否是标题、行代码、块代码,自己按需修改源码实现
注意:文档中的图片无法处理
参考资料:
    [python-docx官方文档](https://python-docx.readthedocs.io/en/latest/index.html)
    [第105天:Python操作Word](http://www.ityouknow.com/python/2019/12/31/python-word-105.html)
    [python-docx处理word文档](https://zhuanlan.zhihu.com/p/61340025)
    [Python顺序读取word文档中的文本与表格](https://blog.csdn.net/qq_39600166/article/details/101537368)
配置文件示例(json格式,Windows下需要两个反斜杠):
    配置说明:
        head_1_color:一级标题颜色(7030A0:紫色)
        head_2_color:二级标题颜色(0070C0:蓝色)
        head_3_color:三级标题颜色(00B050:绿色)
        head_4_color:四级标题颜色(C55A11:橙色)
        head_5_color:五级标题颜色(FF66CC:粉色)
        line_code_color:行代码颜色(C00000:红色)
        head_1_sharp_num:一级标题几个#号,后续标题#个数依次递增
        mode:1:处理单个docx文档,2:处理目录下的所有docx文档
        src_path:docx文档路径(模式1),或目录路径(模式2)
            模式1:e.g. D:\\\\下载\\\\test.docx
            模式2:e.g. D:\\\\下载\\\\test
        save_path:markdown文件保存路径
            模式1:e.g. D:\\\\下载\\\\test.md
            模式2:e.g. D:\\\\下载\\\\test_md
    单docx转markdown:
        {"head_1_color":"7030A0", "head_2_color":"0070C0", "head_3_color":"00B050", "head_4_color":"C55A11", "head_5_color":"FF66CC", "line_code_color":"C00000", "head_1_sharp_num":2, "mode":1, "src_path":"D:\\\\下载\\\\test.docx", "save_path":"D:\\\\下载\\\\test.md"}
    一个目录下的所有docx转markdown:
        {"head_1_color":"7030A0", "head_2_color":"0070C0", "head_3_color":"00B050", "head_4_color":"C55A11", "head_5_color":"FF66CC", "line_code_color":"C00000", "head_1_sharp_num":2, "mode":2, "src_path":"D:\\\\下载\\\\test", "save_path":"D:\\\\下载\\\\test_md"}
    '''
    print(msg)

    config = input("input config json content (copy and modify from above config examples):\n")
    # str.strip() returns a new string; the result has to be kept.
    config = json.loads(config.strip())

    # Module-level settings read by write_paragraph() and its helpers.
    h1c = config["head_1_color"]
    h2c = config["head_2_color"]
    h3c = config["head_3_color"]
    h4c = config["head_4_color"]
    h5c = config["head_5_color"]
    lcc = config["line_code_color"]
    h1n = config["head_1_sharp_num"]
    mode = config["mode"]
    src_path = config["src_path"]
    save_path = config["save_path"]

    if mode == 1:
        docx_2_markdown(src_path, save_path)
    elif mode == 2:
        # save_path is wiped below; refuse to wipe the directory being read.
        if os.path.abspath(save_path) == os.path.abspath(src_path):
            raise ValueError('save_path must differ from src_path in mode 2')

        # Create a fresh output directory.
        if os.path.exists(save_path):
            shutil.rmtree(save_path)
        os.makedirs(save_path)

        # List all docx documents, skipping the '~$name.docx' lock files Word
        # creates next to open documents (they are not valid docx packages).
        docx_files = [
            path for path in glob.glob(os.path.join(src_path, '*.docx'))
            if not os.path.basename(path).startswith('~$')
        ]

        # docx 2 markdown
        for docx_file in docx_files:
            md_name = os.path.splitext(os.path.basename(docx_file))[0] + '.md'
            docx_2_markdown(docx_file, os.path.join(save_path, md_name))
    else:
        raise ValueError('unsupported mode: %r (expected 1 or 2)' % (mode,))

    print('\nDone!')
    input('Press any key to exit!')
# Original article: https://www.cnblogs.com/sinicheveen/p/14682862.html