首页 > 编程语言 > 详细

dummy多线程爬取梨视频例子

时间:2020-08-26 19:25:20      阅读:106      评论:0      收藏:0      [点我收藏+]
# _*_ coding:utf-8 _*_
"""
@FileName   :2.梨视频数据爬取.py
@CreateTime :2020/8/26 0026 15:26
@Author     : Lurker Zhang
@E-mail     : 289735192@qq.com
@Desc.      : 爬取梨视频音乐版块视频,地址:https://www.pearvideo.com/popular_59
"""
import atexit

from setting.config import *
from lxml import etree
import requests
import json
import re
import time
import os
from multiprocessing.dummy import Pool


def main(n=1, workers=4):
    """Collect video IDs from the listing pages, then download each video.

    Both phases run on thread pools from ``multiprocessing.dummy``.

    :param n: number of listing pages to scan; pages are requested at
        offsets 0, 12, 24, ... so up to ``n * 12`` videos are collected
    :param workers: number of threads used by each pool
    :return: None
    """
    # The listing endpoint is paged through its ``start`` offset in steps of 12.
    page_offsets = range(0, n * 12, 12)
    # ``map`` blocks until every task is done, so shutting the pool down when
    # the ``with`` block exits cannot cut a task short.
    with Pool(workers) as get_id_pool:
        get_id_pool.map(get_id_list, page_offsets)
    # get_id_list has filled the module-level all_id_list by this point.
    with Pool(workers) as down_video_pool:
        down_video_pool.map(down_video, all_id_list)


@atexit.register
def renew_id_list():
    """Print the run summary and persist the downloaded-ID list at exit.

    The handler is registered when the module is loaded, but ``id_list`` and
    the counters are only created by the ``__main__`` block. If ``id_list``
    was never created there is nothing to report or save, so the handler
    returns without touching the record file.

    :return: None
    """
    state = globals()
    downloaded = state.get("id_list")
    if downloaded is None:
        return
    print("采集完成,本地成功下载{0}个视频,失败{1}视频。".format(
        state.get("total_success", 0), state.get("total_fail", 0)))
    # Store the IDs of the videos that have been downloaded so far.
    with open("../depository/pear_video/down_id.json", "w", encoding="utf-8") as fp:
        json.dump(downloaded, fp)


def get_id_list(start):
    """
    Fetch one listing page and append the video IDs found on it to ``all_id_list``.

    :param start: offset of the first video on the requested listing page
    :return: None; the IDs are appended to the module-level ``all_id_list``
    """
    print("解析strart={}的12个视频ID".format(start))
    url = "https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=59&start={}".format(start)
    # Fetch the listing fragment that begins at ``start``.
    # NOTE(review): ``headers`` is not defined in this file; presumably it comes
    # from ``setting.config`` - confirm.
    try:
        video_list_page = requests.get(url=url, headers=headers, timeout=10).text
    except requests.RequestException as err:
        # A failed page must not abort the other pages handled by the pool.
        print("start={} listing request failed:".format(start), repr(err))
        return
    tree = etree.HTML(video_list_page)
    hrefs = tree.xpath("/html/body/li/div/a/@href")
    # The ID is the part of the href after the first underscore; hrefs without
    # an underscore are ignored instead of raising IndexError.
    all_id_list.extend(href.split("_")[1] for href in hrefs if "_" in href)


def down_video(data_id):
    """
    Download the video with the given ID into the ``path`` directory.

    An ID already present in ``id_list`` is skipped and counted in
    ``total_fail``. A download that fails (request error, source URL or title
    not found in the page, file cannot be written) is also counted in
    ``total_fail``. A successful download appends the ID to ``id_list`` and
    increments ``total_success``.

    :param data_id: the video id for download
    :return: None after a skip or a successful download, 0 after a failure
    """
    # NOTE(review): main() runs this function on a thread pool and these
    # counter updates are not guarded by a lock - confirm that is acceptable.
    global total_fail, total_success
    print("开始下载保存data_id={}的视频".format(data_id))
    url = "https://www.pearvideo.com/video_{}".format(data_id)
    if data_id in id_list:
        total_fail += 1
        print(data_id, "已经下载过了,跳过!")
        return None

    # The video source address is not in the static markup; it sits in the
    # page's JavaScript as srcUrl="...", so it is extracted with a regex.
    url_ex = r'srcUrl="(.*?)"'
    title_ex = r'"video-tt">(.*?)</h1>'
    try:
        video_preview_page = requests.get(url=url, headers=headers, timeout=10).text
        # [0] raises IndexError when the page does not contain the pattern.
        video_url = re.findall(url_ex, video_preview_page)[0]
        video_title = re.findall(title_ex, video_preview_page)[0]
        # Download the video itself.
        video_content = requests.get(url=video_url, headers=headers, timeout=10).content
        with open(path + video_title + ".mp4", "wb") as fp:
            fp.write(video_content)
    except (requests.RequestException, IndexError, OSError, ValueError) as err:
        # Count and report the failure so the end-of-run summary is accurate.
        total_fail += 1
        print(data_id, "download failed:", repr(err))
        return 0
    id_list.append(data_id)
    total_success += 1
    return None


if __name__ == "__main__":
    # Directory holding the download record and the per-day video folders.
    record_dir = "../depository/pear_video/"
    record_file = record_dir + "down_id.json"
    # Create the directory first: writing the record file or creating the
    # per-day folder fails when a parent directory is missing.
    os.makedirs(record_dir, exist_ok=True)
    # Load the IDs of the videos downloaded by earlier runs, starting from an
    # empty record when none exists yet.
    if not os.path.exists(record_file):
        with open(record_file, "w", encoding="utf-8") as fp:
            json.dump([], fp)
    with open(record_file, "r", encoding="utf-8") as fp:
        id_list = json.load(fp)
    # Videos are saved in a folder named after today's date (YYYYMMDD).
    path = record_dir + time.strftime("%Y%m%d", time.localtime()) + "/"
    os.makedirs(path, exist_ok=True)
    # Counters for this run.
    total_success = 0
    total_fail = 0
    all_id_list = []  # IDs of the videos to download in this run
    main()

技术分享图片

 

dummy多线程爬取梨视频例子

原文:https://www.cnblogs.com/lurkerzhang/p/13566907.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!