html2doc

时间：2017-10-28 19:38:36 阅读：617 评论：0 收藏：0 [点我收藏+]

1.参考

Python 爬虫：把廖雪峰教程转换成 PDF 电子书

https://github.com/lzjun567/crawler_html2pdf

wkhtmltopdf 是一个非常好的工具，可以在多个平台上把 html 转换为 pdf；pdfkit 是 wkhtmltopdf 的 Python 封装包。

https://www.crummy.com/software/BeautifulSoup/bs4/doc/#

也可以通过 BeautifulSoup 插入删除tag

soup.insert
soup.decompose

2.安装

https://wkhtmltopdf.org/downloads.html

下载版本 Windows (MinGW) 0.12.4 32-bit / 64-bit for Windows XP/2003 or later; standalone

添加路径 D:\Program Files\wkhtmltopdf\bin

需要重新打开cmd以及notepad++。。。

pip install pdfkit

API https://pypi.python.org/pypi/pdfkit

定制options，搜索关键字 https://wkhtmltopdf.org/usage/wkhtmltopdf.txt

# Sample pdfkit options. Keys are wkhtmltopdf option names (see the usage
# page linked above); list values hold (name, value) pairs.
options = {
    'page-size': 'Letter',
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in',
    'encoding': "UTF-8",  # needed for Chinese text
    'custom-header': [
        ('Accept-Encoding', 'gzip')
    ],  # this comma was missing in the original snippet (SyntaxError)
    'cookie': [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ],
    'no-outline': None
}

pdfkit.from_url('http://google.com', 'out.pdf', options=options)

3.背景知识

3.1url 相对路径绝对路径

In [323]: urlparse.urljoin('https://doc.scrapy.org/en/latest/index.html', 'intro/overview.html')  #相当于 ./intro/overview.html，其中 . 指代当前文件夹 latest
Out[323]: 'https://doc.scrapy.org/en/latest/intro/overview.html'

In [324]: urlparse.urljoin('https://doc.scrapy.org/en/latest/intro/overview.html', '#walk-through-of-an-example-spider')  #当前网页某个tag id=walk-through-of-an-example-spider
Out[324]: 'https://doc.scrapy.org/en/latest/intro/overview.html#walk-through-of-an-example-spider'

In [326]: urlparse.urljoin('https://doc.scrapy.org/en/latest/intro/overview.html', 'install.html')  #相当于 ./install.html
Out[326]: 'https://doc.scrapy.org/en/latest/intro/install.html'

In [327]: urlparse.urljoin('https://doc.scrapy.org/en/latest/intro/overview.html', '../topics/commands.html')  # .. 指代当前文件夹intro的上一层文件夹latest
Out[327]: 'https://doc.scrapy.org/en/latest/topics/commands.html'

https://doc.scrapy.org/en/latest/index.html

这一类官方文档一般页脚都为：

Built with Sphinx using a theme provided by Read the Docs.

3.2页面布局规律

点击左上角 home 图标转到首页
左边栏页面导航
- <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
  - <a class="reference internal" href="intro/overview.html">Scrapy at a glance</a></li>
正文主体
- <div class="rst-content">

3.3转换pdf时注意事项

提取正文主体后，可以直接将 <div xxxx </div> 保存html，不需要补全 <html>
图片链接相对路径需要转换为绝对路径，才会自动加载图片
- <img alt="Inspecting elements with Firebug" src="../_images/firebug1.png" style="width: 913px; height: 600px;">
pdfkit.from_file 第一个参数 input 为 html文件路径列表，文件名不能是中文。。。
- pdfkit.from_file(self.htmls_saved, self.netloc+'.pdf', options=options)
pdf会根据<h1> <h2>等标题 tag 自动生成目录

4.实践代码

#!/usr/bin/env python
#coding:utf-8

import os
import sys
import traceback
import re
import urlparse
import threading
import Queue

import requests
from scrapy import Selector
import pdfkit


# Shared HTTP session: desktop Chrome user agent, TLS certificate
# verification turned off, and an HTTPS adapter with a pool size of 1000.
s = requests.Session()
# Alternative user agent (iPhone / WeChat client), kept for reference:
# s.headers.update({'user-agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_5 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13G36 MicroMessenger/6.5.12 NetType/4G'})
s.headers.update({'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'})
# s.headers.update({'Referer':'https://servicewechat.com/wx55b926152a8c3bef/14/page-frame.html'})
s.verify = False
# Only the https:// prefix gets the enlarged pool; http:// keeps the default adapter.
s.mount('https://', requests.adapters.HTTPAdapter(pool_connections=1000, pool_maxsize=1000))
import copy
# `sp` is an independent copy of `s` that routes traffic through a local proxy.
sp = copy.deepcopy(s)
# NOTE(review): the 'https' entry uses an https:// proxy URL, i.e. TLS to the
# proxy itself. If 127.0.0.1:1080 is a plain HTTP proxy this probably needs to
# be 'http://127.0.0.1:1080' -- confirm against the proxy actually in use.
proxies = {'http': 'http://127.0.0.1:1080', 'https': 'https://127.0.0.1:1080'}
sp.proxies = proxies

from urllib3.exceptions import InsecureRequestWarning
from warnings import filterwarnings
# Certificate verification is disabled above (s.verify = False); silence the
# InsecureRequestWarning category so it does not flood the console.
filterwarnings('ignore', category=InsecureRequestWarning)

# Minimal UTF-8 HTML page skeleton with str.format placeholders: {content} is
# placed in <body>; {title} sits inside an HTML comment, so it is not rendered.
html_template = u"""
<!DOCTYPE html>

<html>
    <head>
        <meta charset="utf-8" />
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    </head>
    <body>
        <!-- <center><h1>{title}</h1></center> -->
        {content}
    </body>
</html>
"""

# Default options handed to pdfkit when the PDF is built. Keys are
# wkhtmltopdf option names: https://wkhtmltopdf.org/usage/wkhtmltopdf.txt
options = {
    'page-size': 'A4',  # the pdfkit sample uses 'Letter'
    'minimum-font-size': 25,  # default; HTMLtoPDF sets this from its font_size argument
    # 'image-dpi': 1500,

    'margin-top': '0.1in',  # the pdfkit sample uses 0.75in
    'margin-right': '0.1in',
    'margin-bottom': '0.1in',
    'margin-left': '0.1in',
    'encoding': 'UTF-8',  # needed for Chinese text
    'custom-header': [
        ('Accept-Encoding', 'gzip')
    ],
    # NOTE(review): these look like the placeholder cookies from the pdfkit
    # sample -- replace or remove them if the target site needs none.
    'cookie': [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ],
    'outline-depth': 10,
}


class HTMLtoPDF(object):
    """Crawl a documentation site and merge its pages into a single PDF.

    The seed page is fetched, the links selected by ``css_links`` are
    collected in page order, and worker threads download every page. For each
    page the raw HTML is written to a folder named after the site's netloc and
    the extracted main content (``css_content``) is written to ``temp``. The
    content files are finally combined into ``<netloc>.pdf`` with pdfkit.
    """

    # Seconds passed to requests as ``timeout`` so that a stalled connection
    # raises instead of blocking a worker (and the final queue join) forever.
    request_timeout = 60

    # Matches an <img> tag and captures its src value. Group 3 must start with
    # the closing quote, otherwise the rebuilt tag would lose it.
    # e.g. <img alt="a.png" class="align-center" src="_images/a.png">
    img_scr_pattern = re.compile(r'(<img\s+[^>]*?src\s*=\s*")(?P<src>.*?)(".*?>)')

    # Matches an <a ...>text</a> element and captures href and inner text; an
    # empty inner text still matches (group 'text' is then '').
    # The inner text may itself be markup, e.g.
    # <a href="http://openqa.org/"><img alt="logo" src="/images/openqa-logo.png"/></a>
    a_href_pattern = re.compile(r'(<a\s+[^>]*?href\s*=\s*")(?P<href>.*?)(".*?>)(?P<text>.*?)(</a>)')

    def __init__(self, seed_url, font_size=25, css_links='div[class="wy-menu wy-menu-vertical"] a::attr(href)',
                 css_content='div.rst-content', threads_count=30):
        """Prepare output folders and queue every page reachable from the seed.

        :param seed_url: index page of the documentation to convert.
        :param font_size: value used for wkhtmltopdf's ``minimum-font-size``.
        :param css_links: CSS selector yielding the href of each page link.
        :param css_content: CSS selector of the element holding the main text.
        :param threads_count: number of download worker threads.

        Note: the seed page is downloaded here (via ``get_links``).
        """
        self.seed_url = seed_url
        # Per-instance copy so that the module-level defaults are not mutated.
        self.options = dict(options)
        self.options['minimum-font-size'] = font_size

        self.netloc = urlparse.urlparse(seed_url).netloc
        print(self.netloc)
        self.folder = os.path.join(sys.path[0], self.netloc)
        self.folder_temp = os.path.join(sys.path[0], 'temp')
        for f in [self.folder, self.folder_temp]:
            if not os.path.isdir(f):
                os.mkdir(f)

        self.css_content = css_content
        self.css_links = css_links

        self.threads_count = threads_count
        self.htmls_saved = []
        self.links_queue = Queue.Queue()

        # The seed page is always document number 0.
        self.links_queue.put(('0', self.seed_url))
        self.get_links()

    def get_links(self):
        """Queue ``(number, absolute_url)`` for every distinct linked page.

        Fragments (``#...``) are stripped so that several anchors into the
        same page yield one entry, and the order of first appearance is kept
        because the number later decides the page order in the PDF.
        """
        text = self.load_page(self.seed_url)
        sel = Selector(text=text)

        # Typical hrefs: u'#selenium-documentation', u'00_Note_to-the-reader.jsp',
        # u'01_introducing_selenium.jsp#test-automation-for-web-applications'
        links = [re.sub(r'#.*$', '', i) for i in sel.css(self.css_links).extract()]

        # A set is used for membership only; iterating `links` keeps the order
        # (iterating set(links) would shuffle the pages).
        links_seen = set([self.seed_url])
        for link in links:
            link_abs = urlparse.urljoin(self.seed_url, link)
            if link_abs not in links_seen:
                self.links_queue.put((str(len(links_seen)), link_abs))
                links_seen.add(link_abs)

    def save_html(self):
        """Worker loop: download queued pages and write them to disk.

        Runs forever (intended for daemon threads). A failure on one page is
        printed and the page is skipped; the queue item is always marked done
        so that ``links_queue.join()`` in ``run`` cannot block on it.
        """
        while True:
            # Fetch outside the try block so `url` is bound in the handler.
            num, url = self.links_queue.get()
            try:
                text = self.load_page(url)
                title, content = self.parse_page(url, text)

                # Raw page: the file name may keep CJK characters from the title.
                filename_cn = u'{}_{}.html'.format(num, re.sub(u'[^\u4e00-\u9fa5\\w\\s()_-]', '', title))
                # Content for wkhtmltopdf: non-ASCII names are avoided because
                # pdfkit.from_file cannot handle Chinese file names.
                filename = u'{}_{}.html'.format(num, re.sub(r'[^\w\s()_-]', '', title))

                with open(os.path.join(self.folder, filename_cn), 'wb') as fp:
                    fp.write(text.encode('utf-8', 'replace'))
                f = os.path.join(self.folder_temp, filename)
                with open(f, 'wb') as fp:
                    # The bare <div> is enough; wrapping it in html_template is optional.
                    fp.write(content.encode('utf-8', 'replace'))
                self.htmls_saved.append(f)
                print('{}/{}'.format(len(self.htmls_saved), self.links_queue.qsize()))
            except Exception as err:
                print('{} {} {}'.format(url, err, traceback.format_exc()))
            finally:
                self.links_queue.task_done()

    def run(self):
        """Download all queued pages with worker threads, then build the PDF."""
        threads = [threading.Thread(target=self.save_html) for _ in range(self.threads_count)]
        for t in threads:
            # Daemon threads: the endless worker loops must not keep the process alive.
            t.daemon = True
            t.start()

        self.links_queue.join()
        print('load done')

        def page_number(path):
            # File names look like '<number>_<title>.html'.
            _, filename = os.path.split(path)
            return int(filename[:filename.index('_')])

        # Workers finish in arbitrary order; restore the navigation order.
        self.htmls_saved.sort(key=page_number)
        pdfkit.from_file(self.htmls_saved, self.netloc + '.pdf', options=self.options)
        print('{} pdf done'.format(self.netloc))

    def load_page(self, url):
        """Fetch ``url`` through the proxied session and return decoded text.

        When requests reports ISO-8859-1, the charset declared inside the
        document is used if present, otherwise the detected
        ``apparent_encoding``.
        """
        resp = sp.get(url, timeout=self.request_timeout)

        if resp.encoding == 'ISO-8859-1':
            encodings = requests.utils.get_encodings_from_content(resp.content)
            if encodings:
                resp.encoding = encodings[0]
            else:
                resp.encoding = resp.apparent_encoding

        return resp.text

    def parse_page(self, url, text):
        """Return ``(title, content)`` for a page.

        ``title`` is the text of ``<head><title>`` and ``content`` the first
        element matching ``self.css_content`` after cleaning and link
        rewriting; either is ``''`` when nothing matches.
        """
        sel = Selector(text=text)

        title = sel.css('head title::text').extract_first() or ''
        content = sel.css(self.css_content).extract_first() or ''

        content = self.clean_content(content)
        content = self.modify_content(url, content)

        return title, content

    def clean_content(self, content):
        """Remove the language chooser and the non-Python code samples.

        Every ``div#codeLanguagePreference`` and every ``div.highlight-<lang>``
        for java, csharp, ruby, php, perl and javascript is cut out of
        ``content`` by plain string replacement.
        """
        sel = Selector(text=content)
        # Looping over extract() also covers the case of no match at all.
        for div in sel.css('div#codeLanguagePreference').extract():
            content = content.replace(div, '')

        for lang in ['java', 'csharp', 'ruby', 'php', 'perl', 'javascript']:
            for div in sel.css('div.highlight-%s' % lang).extract():
                content = content.replace(div, '')

        return content

    def modify_content(self, url, content):
        """Make image and link URLs absolute and append each URL to its link text.

        :param url: URL of the page ``content`` came from; relative URLs are
            resolved against it (not against the seed URL).
        :param content: HTML fragment.
        :return: the rewritten HTML fragment.
        """
        def func_src(m):
            # Relative image paths such as ../_images/firebug1.png would not
            # be loaded by wkhtmltopdf.
            src = m.group('src')
            if not src.startswith('http'):
                src = urlparse.urljoin(url, src)
            return u'{}{}{}'.format(m.group(1), src, m.group(3))

        content = re.sub(self.img_scr_pattern, func_src, content)

        def func_href(m):
            href = m.group('href')
            text = m.group('text')
            # Same-page anchors (#...) are left untouched.
            if not href.startswith('#'):
                if not href.startswith('http'):
                    href = urlparse.urljoin(url, href)
                # Show the target next to the link text so it is visible on paper.
                text = u'{text} ({href})'.format(text=text, href=href)
            return u'{g1}{href}{g3}{text}{g5}'.format(g1=m.group(1), g3=m.group(3), g5=m.group(5), href=href, text=text)

        content = re.sub(self.a_href_pattern, func_href, content)

        return content

    def modify_content2(self, url, content):
        """Selector-based alternative to ``modify_content``.

        Performs the same two rewrites (absolute image URLs, link text
        followed by its URL) by extracting each tag with a Selector and
        replacing it as a whole inside ``content``.
        """
        sel = Selector(text=content)

        # Absolute image URLs are required for the images to appear in the PDF.
        for i in sel.css('img[src]'):
            tag = i.extract()
            src = i.xpath('./@src').extract_first()
            if not src.startswith('http'):
                src_abs = urlparse.urljoin(url, src)
                # str.replace also rewrites an alt attribute equal to src.
                tag_new = tag.replace(src, src_abs)
                content = content.replace(tag, tag_new)

        # Append the href to the visible text of every link.
        for i in sel.css('a[href]'):
            tag = i.extract()
            href = i.xpath('./@href').extract_first()
            text = i.xpath('./text()').extract_first()

            # Complete site-internal links; ignore same-page anchors.
            if not href.startswith('http') and not href.startswith('#'):
                href_abs = urlparse.urljoin(url, href)
                tag_new = tag.replace(href, href_abs)
            else:
                href_abs = href
                tag_new = tag

            # Icon links have no text node (text is None); skip those.
            if text and not href.startswith('#'):
                text_new = u'{} ({})'.format(text, href_abs)
                tag_new = tag_new.replace(text, text_new)

            # Replace the whole tag so only this occurrence's markup changes.
            content = content.replace(tag, tag_new)

        return content



if __name__ == '__main__':
    # Other documentation sites this script has been used with; swap one in
    # for the active pair below to convert it instead:
    #   url = 'https://doc.scrapy.org/en/latest/index.html'
    #   obj = HTMLtoPDF(url)
    #
    #   url = 'http://python3-cookbook.readthedocs.io/zh_CN/latest/index.html'
    #   obj = HTMLtoPDF(url, font_size=20, css_links='div[class="toctree-wrapper compound"] a::attr(href)')

    url = 'http://www.seleniumhq.org/docs/'
    obj = HTMLtoPDF(url, css_links='div#selenium-documentation a::attr(href)', css_content='div#mainContent')

    obj.run()

html2doc

原文：http://www.cnblogs.com/my8100/p/7738366.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)

html2doc

1.参考

2.安装

3.背景知识

3.1url 相对路径 绝对路径

3.2页面布局规律

3.3转换pdf时注意事项

4.实践代码

3.1url 相对路径绝对路径