python操作PDF------提取PDF文字内容

时间：2020-09-04 20:59:48 阅读：76 评论：0 收藏：0 [点我收藏+]

# 安装  pip install pdfplumber
import pdfplumber

# Extract the text of the first page with pdfplumber.
# The file name must be delimited with plain ASCII quotes; typographic
# quotes are not valid string delimiters in Python.
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    # pdf.pages is indexable; index 0 is the first page of the document.
    first_page = pdf.pages[0]
    print(first_page.extract_text())


# Extract a single table from the first page with pdfplumber.
# extract_table() yields one table as a list of rows (each row a list of
# cell values); per the pdfplumber docs it picks the largest table on the
# page.
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    first_page = pdf.pages[0]
    print(first_page.extract_table())


# Extract every table found on the first page with pdfplumber.
# extract_tables() yields a list of tables, so each one is printed in turn.
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    first_page = pdf.pages[0]
    for table in first_page.extract_tables():
        print(table)


# Extract a single financial-report table from the first page.
# table_settings configures how pdfplumber detects the table; here both
# the vertical and the horizontal strategy are set to 'text'.
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    first_page = pdf.pages[0]
    # Use extract_table (singular): the loop below treats the result as the
    # rows of ONE table. The plural extract_tables would hand back a list of
    # tables, and each "row" would then actually be a whole table.
    table = first_page.extract_table(
        table_settings={
            'vertical_strategy': 'text',
            'horizontal_strategy': 'text',
        }
    )
    new_table = []
    # `table or []` guards against no table being detected on the page.
    for row in table or []:
        # Skip blank rows. Truthiness treats both '' and None cells as
        # empty, matching how the merge step below handles them.
        if not any(row):
            continue
        # Merge the first three cells into one string (the 'text' strategy
        # may split a single word/label across several columns), then keep
        # the remaining cells unchanged.
        new_row = [''.join(str(item) if item else '' for item in row[:3])]
        new_row += row[3:]
        new_table.append(new_row)
    print(new_table)

原文：https://www.cnblogs.com/nanamiyi/p/13615665.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)