暂时记录,改天再整理
import re
import os
import pandas as pd
from requests import get
from docx import Document
import win32com.client as win
import subprocess
# dir2 = 'C:/Users/User/Documents/gzzw/'
# names = os.listdir(dir2)
# data = {}
# word = win.Dispatch('Word.Application')
# for (i, n) in enumerate(names):
#     subprocess.call(['soffice', '--headless', '--convert-to', 'docx', '--outdir', dir2 + str(i) + '.docx', dir + n])
#     doc = Document(docx=dir2 + n)
#     table = doc.tables[1]
#     row = table.rows[0]
#     row1 = table.rows[1]
#     for k, v in zip(row.cells, row1.cells):
#         if i == 0:
#             data[k.text] = [v.text]
#         else:
#             data[k.text].append(v.text)
#     gs = re.match(r'.*_(.*)_.*', n)
#     dn.append(gs.group(1))
#     os.rename(dir + n, dir + str(i) + '.doc')
#     doc = word.Documents.Open(dir + n)
#     doc.SaveAs(dir2 + str(i) + '.docx', FileFormat=12)
#     table = doc.Tables(2)
#     for j in range(table.Columns.Count):
#         print(table.Cell(Row=1, Column=i + 1).Range.Text)
#         label.append(table.Cell(Row=1, Column=i + 1).Range.Text.encode('utf8'))
#         dn.append(table.Cell(Row=2, Column=i + 1).Range.Text.encode('utf8'))
# word.Quit()
# sheel = pd.DataFrame(data)
# sheel.to_excel(dir2 + 'statics.xlsx', index=False, encoding='utf8')
一些参考链接:
1. https://code.activestate.com/recipes/279003-converting-word-documents-to-text/
2. https://stackoverflow.com/questions/1468099/python-win32-extensions-documentation
3. https://stackoverflow.com/questions/38468442/multiple-doc-to-docx-file-conversion-using-python
4. https://www.jianshu.com/p/4fa504c720c1
原文:https://www.cnblogs.com/darkchii/p/12051950.html