首页 > 其他 > 详细

爬取廖雪峰的博客

时间:2020-01-13 17:28:32      阅读:111      评论:0      收藏:0      [点我收藏+]
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pdfkit
import os
import re
import time
import sys
import random
sys.path.append('../' )
from mytools import mail
import logging
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Environment setup: the log directory must exist before the file handler
# below opens a file inside it.
log_path = '/home/jiangwenwen/python/log/'
if not os.path.exists(log_path):
    os.makedirs(log_path)

# Logger setup: INFO and above are written to a UTF-8 log file. The file
# path is derived from log_path so the directory is defined in one place.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
file_handler = logging.FileHandler(log_path + 'liaoxuefeng.log', encoding='utf-8')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

ua = UserAgent()

# Headers sent with the wiki index request. NOTE: ua.random is evaluated
# once, here, so the same User-Agent string is reused for the whole run.
headers = {
    "Host": "www.liaoxuefeng.com",
    "User-Agent": ua.random,
    "Referer": "https://www.liaoxuefeng.com/wiki/1252599548343744",
}

# JavaScript executed in the page before rendering (lazy-loading workaround):
# copies every element's data-src attribute into its src attribute.
run_script = "$(function () { $('[data-src]').each(function () { $(this).attr('src', $(this).attr('data-src')); })})"

# Options handed to pdfkit.from_url().
options = {
    # Wait some milliseconds for javascript finish (default 200)
    '--javascript-delay': '5000',
    '--run-script': run_script
}


def _safe_file_name(title):
    """Return *title* with path separators replaced so it is one file name."""
    # An ASCII slash or backslash in a chapter title would be read as a
    # directory separator. Swap them for the full-width look-alikes
    # (U+FF0F and U+FF3C), written as escapes so they survive re-encoding.
    # NOTE(review): the full-width targets are presumed to be the author's
    # original intent; confirm against already-downloaded file names.
    return title.replace('/', u'\uff0f').replace('\\', u'\uff3c')


def save_pdf(url, category):
    """Save every chapter linked from a wiki index page as a PDF file.

    Fetches ``url``, collects all ``<a class="x-wiki-index-item">`` links and
    renders each linked page to
    ``/home/jiangwenwen/liaoxuefeng/<category>/<link text>.pdf`` with pdfkit.
    Chapters whose PDF already exists are skipped, so the function can be
    re-run to resume an interrupted download.

    Args:
        url: Address of the wiki index page that lists the chapters.
        category: Name of the sub-folder the PDFs are written to.

    Returns:
        None. Any exception is logged with its traceback and reported via
        ``mail.sendMail``; it is not re-raised, and the remaining chapters of
        this category are not processed.
    """
    try:
        # Diagnostic only: record which public IP the requests come from.
        response = requests.get('https://httpbin.org/ip', timeout=30)
        logger.info('Your IP is {0}'.format(response.json()['origin']))

        # A timeout keeps an unresponsive server from hanging the run, and
        # raise_for_status() stops an error page from being parsed as an
        # index with zero chapters.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # The target folder is the same for every chapter: create it once.
        file_path = "/home/jiangwenwen/liaoxuefeng/" + category + "/"
        if not os.path.exists(file_path):
            os.makedirs(file_path)

        for child in soup.find_all("a", class_="x-wiki-index-item"):
            href = child.get('href')
            if not href:
                # An anchor without a target cannot be downloaded.
                continue
            page_url = "https://www.liaoxuefeng.com" + href

            # get_text() also works when the anchor wraps nested tags,
            # where .string would be None.
            file_name = file_path + _safe_file_name(child.get_text()) + ".pdf"
            if os.path.exists(file_name):
                continue

            pdfkit.from_url(page_url, file_name, options=options)
            logger.info(file_name + u'下载成功')
            # Long random pause (12-20 minutes) between downloads.
            time.sleep(random.randint(720, 1200))
    except Exception as e:
        # Log first so the traceback is kept even if sending the mail fails.
        logger.exception(str(e))
        mail.sendMail('廖雪峰的官方网站:' + str(e))


# Wiki index pages to mirror, as (index URL, output sub-folder) pairs.
# They are processed in the listed order: Java, Python, JavaScript, SQL, Git.
for _wiki_url, _category in (
    ("https://www.liaoxuefeng.com/wiki/1252599548343744", "java"),
    ("https://www.liaoxuefeng.com/wiki/1016959663602400", "python"),
    ("https://www.liaoxuefeng.com/wiki/1022910821149312", "JavaScript"),
    ("https://www.liaoxuefeng.com/wiki/1177760294764384", "sql"),
    ("https://www.liaoxuefeng.com/wiki/896043488029600", "git"),
):
    save_pdf(_wiki_url, _category)

# Written once every category has been attempted.
logger.info('下载成功!!!')



爬取廖雪峰的博客

原文:https://www.cnblogs.com/jiangwenwen1/p/12188092.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!