爬取廖雪峰的博客

时间：2020-01-13 17:28:32 阅读：112 评论：0 收藏：0 [点我收藏+]

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pdfkit
import os
import re
import time
import sys
import random
sys.path.append('../' )
from mytools import mail
import logging
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Environment setup: make sure the log directory exists.
log_path = '/home/jiangwenwen/python/log/'
# exist_ok=True replaces the separate os.path.exists() test, which could race
# with another process creating the directory between the check and the call.
os.makedirs(log_path, exist_ok=True)

# Logger setup: INFO and above are appended to a UTF-8 encoded log file.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
# The file location is derived from log_path so the directory created above and
# the file opened here cannot drift apart when the path is edited.
file_handler = logging.FileHandler(os.path.join(log_path, 'liaoxuefeng.log'), encoding='utf-8')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# User-Agent generator; one random value is drawn below when the module is
# loaded and then reused for every request that sends `headers`.
ua = UserAgent()

# HTTP headers sent with the wiki index page request.
headers = {}
headers["Host"] = "www.liaoxuefeng.com"
headers["User-Agent"] = ua.random
headers["Referer"] = "https://www.liaoxuefeng.com/wiki/1252599548343744"

# JavaScript run in the page before rendering (lazy-loading workaround):
# copies every element's data-src attribute into its src attribute.
run_script = "$(function () { $('[data-src]').each(function () { $(this).attr('src', $(this).attr('data-src')); })})"

# Options passed to pdfkit.from_url().
options = {}
# Wait some milliseconds for JavaScript to finish (default 200).
options['--javascript-delay'] = '5000'
options['--run-script'] = run_script


def _log_public_ip():
    """Log the public IP reported by httpbin.org; failures are only warned about."""
    try:
        ip_response = requests.get('https://httpbin.org/ip', timeout=30)
        logger.info('Your IP is {0}'.format(ip_response.json()['origin']))
    except (requests.RequestException, ValueError, KeyError) as e:
        # The lookup is purely diagnostic, so it must not abort the download.
        logger.warning('Could not determine public IP: %s', e)


def save_pdf(url, category):
    """Save every chapter linked from a wiki index page as a PDF file.

    Fetches the index page at ``url``, collects every
    ``<a class="x-wiki-index-item">`` link and renders each linked page to
    ``/home/jiangwenwen/liaoxuefeng/<category>/<title>.pdf`` with pdfkit.
    A chapter whose PDF file already exists is skipped, so the function can
    be re-run to resume an interrupted download.

    Args:
        url: Address of the wiki index page to crawl.
        category: Name of the sub-directory the PDF files are written to.

    Returns:
        None. Errors are not raised to the caller: the first failure is
        logged with its traceback, reported through ``mail.sendMail`` and
        ends the processing of this category.
    """
    try:
        _log_public_ip()

        # A timeout keeps an unresponsive server from hanging the script
        # forever; raise_for_status() turns an HTTP error page into an
        # exception instead of silently parsing it and finding no links.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        links = soup.find_all("a", class_="x-wiki-index-item")
        if not links:
            logger.warning('No chapter links found at %s', url)
            return

        # Target directory is the same for every chapter: create it once.
        file_path = "/home/jiangwenwen/liaoxuefeng/" + category + "/"
        os.makedirs(file_path, exist_ok=True)

        for child in links:
            href = child.get('href')
            if not href:
                # An anchor without href cannot be downloaded.
                logger.warning('Skipping index entry without href: %r', child)
                continue
            page_url = "https://www.liaoxuefeng.com" + href

            # .string is None when the anchor contains nested markup; fall
            # back to the concatenated text in that case.
            title = child.string if child.string is not None else child.get_text()
            if not title:
                logger.warning('Skipping index entry without title: %s', page_url)
                continue

            # Absolute file name; path separators in the title are replaced
            # by their full-width look-alikes so they stay inside file_path.
            file_name = file_path + title.replace('/', '／').replace('\\', '＼') + ".pdf"

            # Download only when the file does not exist yet.
            if os.path.exists(file_name):
                continue
            pdfkit.from_url(page_url, file_name, options=options)
            logger.info(file_name + u'下载成功')
            # Long random pause (12-20 minutes) between two downloads.
            time.sleep(random.randint(720, 1200))
    except Exception as e:
        # Log first so the traceback is recorded even if sending mail fails.
        logger.exception(str(e))
        mail.sendMail('廖雪峰的官方网站:' + str(e))


# Wiki index pages to download, in order: (index page URL, output sub-directory).
wiki_targets = (
    # Java tutorial
    ("https://www.liaoxuefeng.com/wiki/1252599548343744", "java"),
    # Python tutorial
    ("https://www.liaoxuefeng.com/wiki/1016959663602400", "python"),
    # JavaScript tutorial
    ("https://www.liaoxuefeng.com/wiki/1022910821149312", "JavaScript"),
    # SQL tutorial
    ("https://www.liaoxuefeng.com/wiki/1177760294764384", "sql"),
    # Git tutorial
    ("https://www.liaoxuefeng.com/wiki/896043488029600", "git"),
)

for index_url, folder in wiki_targets:
    save_pdf(index_url, folder)

# Written once all save_pdf() calls above have returned.
logger.info('下载成功！！！')

爬取廖雪峰的博客

原文：https://www.cnblogs.com/jiangwenwen1/p/12188092.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)