0.
https://github.com/lzjun567/crawler_html2pdf
wkhtmltopdf 就是一个非常好的工具,它可以用于多平台的 html 到 pdf 的转换,pdfkit 是 wkhtmltopdf 的 Python 封装包。
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#
也可以通过 BeautifulSoup 插入删除tag
soup.insert
soup.decompose
https://wkhtmltopdf.org/downloads.html
下载版本 Windows (MinGW) 0.12.4 32-bit / 64-bit for Windows XP/2003 or later; standalone
添加路径 D:\Program Files\wkhtmltopdf\bin
需要重新打开cmd以及notepad++。。。
pip install pdfkit
API https://pypi.python.org/pypi/pdfkit
定制options,搜索关键字 https://wkhtmltopdf.org/usage/wkhtmltopdf.txt
options = {
    'page-size': 'Letter',
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in',
    'encoding': "UTF-8",  #支持中文
    'custom-header': [
        ('Accept-Encoding', 'gzip')
    ],
    'cookie': [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ],
    'no-outline': None
}
pdfkit.from_url('http://google.com', 'out.pdf', options=options)
In [323]: urlparse.urljoin('https://doc.scrapy.org/en/latest/index.html', 'intro/overview.html')  #相当于 ./intro/overview.html,其中 . 指代当前文件夹 latest
Out[323]: 'https://doc.scrapy.org/en/latest/intro/overview.html'

In [324]: urlparse.urljoin('https://doc.scrapy.org/en/latest/intro/overview.html', '#walk-through-of-an-example-spider')  #当前网页某个tag id=walk-through-of-an-example-spider
Out[324]: 'https://doc.scrapy.org/en/latest/intro/overview.html#walk-through-of-an-example-spider'

In [326]: urlparse.urljoin('https://doc.scrapy.org/en/latest/intro/overview.html', 'install.html')  #相当于 ./install.html
Out[326]: 'https://doc.scrapy.org/en/latest/intro/install.html'

In [327]: urlparse.urljoin('https://doc.scrapy.org/en/latest/intro/overview.html', '../topics/commands.html')  # .. 指代当前文件夹 intro 的上一层文件夹 latest
Out[327]: 'https://doc.scrapy.org/en/latest/topics/commands.html'
https://doc.scrapy.org/en/latest/index.html
这一类官方文档一般页脚都为:
© Copyright 2008-2016, Scrapy developers. Revision 65ac0b06.
Built with Sphinx using a theme provided by Read the Docs.
#!/usr/bin/env python
#coding:utf-8
"""Download the pages linked from a documentation index and merge them into one PDF.

The seed page is fetched, the links matched by a CSS selector are queued,
worker threads download each page and save a cleaned copy of its content
element, and pdfkit (a wrapper around wkhtmltopdf) converts the saved files
into ``<netloc>.pdf``.

Written for Python 2: it imports the ``urlparse`` and ``Queue`` modules.
"""
import copy
import os
import re
import sys
import threading
import traceback
import urlparse
import Queue
from warnings import filterwarnings

import requests
from scrapy import Selector
import pdfkit
from urllib3.exceptions import InsecureRequestWarning

# Plain session: desktop user agent, certificate verification switched off.
s = requests.Session()
# Alternative (mobile / WeChat) user agent:
# s.headers.update({'user-agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_5 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13G36 MicroMessenger/6.5.12 NetType/4G'})
s.headers.update({'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'})
# s.headers.update({'Referer':'https://servicewechat.com/wx55b926152a8c3bef/14/page-frame.html'})
s.verify = False
s.mount('https://', requests.adapters.HTTPAdapter(pool_connections=1000, pool_maxsize=1000))

# Copy of the session above that sends every request through a local proxy.
# HTMLtoPDF.load_page uses this one.
sp = copy.deepcopy(s)
proxies = {'http': 'http://127.0.0.1:1080', 'https': 'https://127.0.0.1:1080'}
sp.proxies = proxies

# verify=False would otherwise emit an InsecureRequestWarning per request.
filterwarnings('ignore', category=InsecureRequestWarning)

# Page wrapper for saved content; only referenced by the commented-out
# write in HTMLtoPDF.save_html.
html_template = u"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8" />
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>
<!-- <center><h1>{title}</h1></center> -->
{content}
</body>
</html>
"""

# Default wkhtmltopdf options, see https://wkhtmltopdf.org/usage/wkhtmltopdf.txt
options = {
    'page-size': 'A4',  # Letter
    'minimum-font-size': 25,
    # 'image-dpi':1500,
    'margin-top': '0.1in',  # 0.75in
    'margin-right': '0.1in',
    'margin-bottom': '0.1in',
    'margin-left': '0.1in',
    'encoding': 'UTF-8',  # original note: supports Chinese
    'custom-header': [
        ('Accept-Encoding', 'gzip')
    ],
    'cookie': [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ],
    'outline-depth': 10,
}


class HTMLtoPDF(object):
    """Crawl the pages linked from ``seed_url`` and convert them to one PDF.

    Constructing an instance already performs I/O: it creates the output
    folders and downloads the seed page to collect the links.  Call
    :meth:`run` to download the pages and build the PDF.
    """

    def __init__(self, seed_url, font_size=25,
                 css_links='div[class="wy-menu wy-menu-vertical"] a::attr(href)',
                 css_content='div.rst-content', threads_count=30):
        """Prepare folders, patterns and the queue of pages to download.

        :param seed_url: index page; queued first and scanned for links.
        :param font_size: value used for the 'minimum-font-size' option.
        :param css_links: CSS selector yielding the href of every page to fetch.
        :param css_content: CSS selector of the element kept from each page.
        :param threads_count: number of download threads started by run().
        """
        self.seed_url = seed_url
        # Per-instance copy, so the font size of one instance does not leak
        # into the module-level defaults used by other instances.
        self.options = dict(options)
        self.options['minimum-font-size'] = font_size
        self.netloc = urlparse.urlparse(seed_url).netloc
        print(self.netloc)

        # Full pages go to <script dir>/<netloc>, cleaned content to <script dir>/temp.
        self.folder = os.path.join(sys.path[0], self.netloc)
        self.folder_temp = os.path.join(sys.path[0], 'temp')
        for f in [self.folder, self.folder_temp]:
            if not os.path.isdir(f):
                os.mkdir(f)

        self.css_content = css_content
        self.css_links = css_links
        self.threads_count = threads_count
        # self.lock = threading.Lock()
        self.htmls_saved = []

        # Patterns can be checked with re.findall(pattern, s).
        # Example: <img alt="_images/chapt3_img05_IDE_open.png" class="align-center" src="_images/chapt3_img05_IDE_open.png">
        # Original note: the trailing group cannot be shortened to just the
        # quote, otherwise the result loses it.
        self.img_scr_pattern = re.compile(r'(<img\s+[^>]*?src\s*=\s*")(?P<src>.*?)(".*?>)')
        # Example: <a class="reference external" href="http://code.google.com/p/selenium/issues/detail?id=1008">issue 1008</a>
        # Also matches when the link text is empty (m.group(4) == '').
        # On http://www.seleniumhq.org/docs/ both occur together, the link text being an <img> tag:
        # <a href="http://openqa.org/"><img alt="openqa.org logo" id="footerLogo" src="/images/openqa-logo.png"/></a>
        self.a_href_pattern = re.compile(r'(<a\s+[^>]*?href\s*=\s*")(?P<href>.*?)(".*?>)(?P<text>.*?)(</a>)')

        # Queue items are (sequence number as str, absolute url); the seed is '0'.
        self.links_queue = Queue.Queue()
        self.links_queue.put(('0', self.seed_url))
        self.get_links()

    def get_links(self):
        """Queue every distinct page linked from the seed page, in document order."""
        text = self.load_page(self.seed_url)
        sel = Selector(text=text)
        # Typical hrefs:
        #   u'#selenium-documentation',
        #   u'00_Note_to-the-reader.jsp',
        #   u'01_introducing_selenium.jsp',
        #   u'01_introducing_selenium.jsp#test-automation-for-web-applications',
        # Fragments are stripped so each page is fetched once.
        # links = [re.sub(r'#.*$','', i) for i in sel.css('div[class="toctree-wrapper compound"] a::attr(href)').extract()]
        links = [re.sub(r'#.*$', '', i) for i in sel.css(self.css_links).extract()]

        # set(links) would lose the ordering, so iterate the list and keep a
        # separate set of the urls already queued.
        links_seen = set([self.seed_url])
        for link in links:
            link_abs = urlparse.urljoin(self.seed_url, link)
            if link_abs not in links_seen:
                self.links_queue.put((str(len(links_seen)), link_abs))
                links_seen.add(link_abs)

    def save_html(self):
        """Worker loop: download queued pages and save them to disk.

        Never returns; run() starts it in daemon threads.  For every page the
        full text is written to ``self.folder`` and the cleaned content to
        ``self.folder_temp``; the latter path is appended to
        ``self.htmls_saved``.
        """
        while True:
            (num, url) = self.links_queue.get()
            try:
                text = self.load_page(url)
                title, content = self.parse_page(url, text)
                # Name for the full page: keeps CJK characters of the title.
                # (Equivalent to the Python 2 literal ur'[^\u4e00-\u9fa5\w\s()_-]'.)
                filename_cn = u'{}_{}.html'.format(num, re.sub(u'[^\u4e00-\u9fa5\\w\\s()_-]', '', title))
                # Name for the temp file handed to pdfkit.
                filename = u'{}_{}.html'.format(num, re.sub(r'[^\w\s()_-]', '', title))
                with open(os.path.join(self.folder, filename_cn), 'wb') as fp:
                    fp.write(text.encode('utf-8', 'replace'))
                f = os.path.join(self.folder_temp, filename)
                with open(f, 'wb') as fp:
                    fp.write(content.encode('utf-8', 'replace'))
                    # fp.write(html_template.format(content=content, title=title).encode('utf-8','replace'))
                self.htmls_saved.append(f)
                print('{}/{}'.format(len(self.htmls_saved), self.links_queue.qsize()))
            except Exception as err:
                # Best effort: report the failed page and carry on with the next one.
                print('{} {} {}'.format(url, err, traceback.format_exc()))
            finally:
                # Must also run for failed pages, otherwise links_queue.join()
                # in run() never returns.
                self.links_queue.task_done()

    def run(self):
        """Download all queued pages with worker threads, then build the PDF.

        The PDF is written to ``<netloc>.pdf``, with the pages ordered by the
        numeric prefix of their saved file names.
        """
        threads = []
        for _ in range(self.threads_count):
            t = threading.Thread(target=self.save_html)
            threads.append(t)
        for t in threads:
            # Daemon threads: the endless worker loops must not keep the process alive.
            t.setDaemon(True)
            t.start()
        self.links_queue.join()
        print('load done')

        def page_number(path):
            # Saved files are named '<num>_<title>.html'.
            _, name = os.path.split(path)
            return int(name[:name.index('_')])

        self.htmls_saved.sort(key=page_number)
        pdfkit.from_file(self.htmls_saved, self.netloc + '.pdf', options=self.options)
        print('{} pdf done'.format(self.netloc))

    def load_page(self, url):
        """Fetch ``url`` through the proxied session and return the decoded text.

        When the response encoding is 'ISO-8859-1', the encoding is taken
        from the page content instead, falling back to
        ``resp.apparent_encoding``.
        """
        resp = sp.get(url)
        if resp.encoding == 'ISO-8859-1':
            encodings = requests.utils.get_encodings_from_content(resp.content)
            if encodings:
                resp.encoding = encodings[0]
            else:
                resp.encoding = resp.apparent_encoding
            # print 'ISO-8859-1 changed to %s'%resp.encoding
        return resp.text

    def parse_page(self, url, text):
        """Return ``(title, content)`` for a downloaded page.

        ``title`` is the text of ``<head><title>`` and ``content`` the first
        element matching ``self.css_content`` after clean_content() and
        modify_content(); either is '' when nothing matches.
        """
        sel = Selector(text=text)
        title = sel.css('head title::text').extract_first() or ''  # fixed selector
        content = sel.css(self.css_content).extract_first() or ''  # e.g. 'div.rst-content'
        content = self.clean_content(content)
        content = self.modify_content(url, content)
        return title, content

    def clean_content(self, content):
        """Remove the language-preference box and non-Python code samples.

        Drops every ``div#codeLanguagePreference`` and every
        ``div.highlight-<lang>`` for java, csharp, ruby, php, perl and
        javascript from ``content`` and returns the result.
        """
        sel = Selector(text=content)
        # extract() returns a (possibly empty) list, unlike extract_first(),
        # which may return None.
        for div in sel.css('div#codeLanguagePreference').extract():
            content = content.replace(div, '')
        for lang in ['java', 'csharp', 'ruby', 'php', 'perl', 'javascript']:
            for div in sel.css('div.highlight-%s' % lang).extract():
                content = content.replace(div, '')
        return content

    def modify_content(self, url, content):
        """Rewrite image sources and links in ``content`` using regexes.

        Image ``src`` values and link ``href`` values that do not start with
        'http' are resolved against ``url`` (the page itself, not the seed:
        e.g. '../_images/firebug1.png' on
        https://doc.scrapy.org/en/latest/topics/firebug.html).  Links that
        are not same-page '#' anchors also get ' (<href>)' appended to their
        text.
        """
        # Match groups cannot be assigned to, so each callback rebuilds the tag.
        # Pattern: r'(<img\s+[^>]*?src\s*=\s*")(?P<src>.*?)(".*?>)'
        def func_src(m):
            src = m.group('src')  # named group
            if not src.startswith('http'):
                src = urlparse.urljoin(url, src)
            return u'{}{}{}'.format(m.group(1), src, m.group(3))
        content = re.sub(self.img_scr_pattern, func_src, content)

        # Pattern: r'(<a\s+[^>]*?href\s*=\s*")(?P<href>.*?)(".*?>)(?P<text>.*?)(</a>)'
        def func_href(m):
            href = m.group('href')
            text = m.group('text')
            if not href.startswith('#'):
                if not href.startswith('http'):
                    href = urlparse.urljoin(url, href)
                text = u'{text} ({href})'.format(text=text, href=href)
            # m.string is the whole content, and m itself cannot be returned.
            return u'{g1}{href}{g3}{text}{g5}'.format(g1=m.group(1), g3=m.group(3), g5=m.group(5), href=href, text=text)
        content = re.sub(self.a_href_pattern, func_href, content)

        return content

    def modify_content2(self, url, content):
        """Selector-based alternative to modify_content().

        Not called anywhere in this script.  Performs the same kind of
        rewriting by replacing whole tags in ``content``.
        """
        sel = Selector(text=content)

        # Make image links absolute; per the original note the PDF cannot
        # show the images otherwise.
        # Example: <img alt="_images/chapt3_img05_IDE_open.png" class="align-center" src="_images/chapt3_img05_IDE_open.png">
        for i in sel.css('img[src]'):
            tag = i.extract()
            src = i.xpath('./@src').extract_first()
            if not src:
                # str.replace('', x) would insert x between every character.
                continue
            if not src.startswith('http'):
                src_abs = urlparse.urljoin(url, src)
                # print src, src_abs
                # Note: this also rewrites alt when it has the same value as src.
                tag_new = tag.replace(src, src_abs)
                content = content.replace(tag, tag_new)

        # Append the href to the text of each link.
        # Example: <a class="reference external" href="http://code.google.com/p/selenium/issues/detail?id=1008">issue 1008</a>
        for i in sel.css('a[href]'):
            tag = i.extract()
            href = i.xpath('./@href').extract_first()
            text = i.xpath('./text()').extract_first()
            if not href:
                # Same empty-string replace hazard as for images.
                continue
            # Complete internal links; leave same-page '#' anchors alone.
            if not href.startswith('http') and not href.startswith('#'):
                href_abs = urlparse.urljoin(url, href)
                # print href, href_abs
                tag_new = tag.replace(href, href_abs)
            else:
                href_abs = href
                tag_new = tag
            # Icon links have no text (None), which replace() cannot handle.
            if text and not href.startswith('#'):
                text_new = u'{} ({})'.format(text, href_abs)
                # print text.encode('gbk','replace'), text_new.encode('gbk','replace')
                tag_new = tag_new.replace(text, text_new)
            # Replace the whole tag in one go.
            content = content.replace(tag, tag_new)

        return content


if __name__ == '__main__':
    # Other sites this was used with:
    # url = 'https://doc.scrapy.org/en/latest/index.html'
    # obj = HTMLtoPDF(url)
    # url = 'http://python3-cookbook.readthedocs.io/zh_CN/latest/index.html'
    # obj = HTMLtoPDF(url, font_size=20, css_links='div[class="toctree-wrapper compound"] a::attr(href)')
    url = 'http://www.seleniumhq.org/docs/'
    obj = HTMLtoPDF(url, css_links='div#selenium-documentation a::attr(href)', css_content='div#mainContent')
    obj.run()
原文:http://www.cnblogs.com/my8100/p/7738366.html