首页 > 其他 > 详细

爬取校花网的视频

时间:2019-08-08 23:51:42      阅读:138      评论:0      收藏:0      [点我收藏+]
from requests_html import HTMLSession
import os
# Single session object shared by every request made in this file.
session = HTMLSession()

# http://www.xiaohuar.com/list-3-0.html
#获取索引页url
def get_index_page():
    """Yield the URL of each video index (listing) page.

    The listing is paginated as ``list-3-<n>.html``; this generator produces
    the first six pages, ``n`` = 0 through 5.

    Yields:
        str: absolute URL of one index page.
    """
    for i in range(6):
        # The URL template must be a quoted string for %-formatting to work.
        url = "http://www.xiaohuar.com/list-3-%s.html" % i
        yield url

#获取
# url= "http://www.xiaohuar.com/list-3-0.html"
# r = session.get(url=url)
# for element in r.html.find('#images a[class="imglink"]'):
#     print(element.attrs.get('href'))


#解析索引页获取详情页url
def get_detail_page(url):
    """Yield the detail-page URL of every video linked from an index page.

    Args:
        url: URL of an index page, as produced by ``get_index_page``.

    Yields:
        The ``href`` attribute of each ``a.imglink`` anchor found inside the
        ``#images`` container. Anchors without an ``href`` are skipped so a
        ``None`` is never handed to the caller as a URL.
    """
    r = session.get(url=url)
    # The selector is a CSS string; it must be quoted.
    for element in r.html.find('#images a[class="imglink"]'):
        href = element.attrs.get('href')
        if href:
            yield href

#测试解析详情页获取视频url,名字
# url = 'http://www.xiaohuar.com/p-3-136.html'
# r = session.get(url=url)
# r.html.encoding = "gbk"
# file_name = r.html.find('title',first=True).text.replace('\\','')
# print(file_name)
#
# element = r.html.find('#media source',first=True)
# if element:
#     mp4_url = element.attrs.get('src')
# else:
#     m3u8_url = r.html.search('var vHLSurl    = "{}";')[0]
#     print(m3u8_url)


#解析详情页获取视频url,名字
def get_url_name(url):
    """Fetch a detail page and extract the video's name, URL and type.

    Args:
        url: URL of a video detail page.

    Returns:
        tuple: ``(file_name, vurl, vtype)`` where ``file_name`` is the page
        title with backslashes removed, ``vurl`` is the video address and
        ``vtype`` is ``"mp4"`` when a ``#media source`` element is present,
        otherwise ``"m3u8"`` (address taken from the ``vHLSurl`` script
        variable).
    """
    r = session.get(url=url)
    # The page is decoded as GBK before the title text is read.
    r.html.encoding = "gbk"
    # Backslashes are removed because the title is later used as a file name.
    file_name = r.html.find('title', first=True).text.replace('\\', '')
    print(file_name)
    element = r.html.find('#media source', first=True)
    if element:
        vurl = element.attrs.get('src')
        vtype = 'mp4'
    else:
        # No direct <source>; fall back to the HLS playlist URL in the script.
        # NOTE(review): if the template does not match, search() presumably
        # returns None and the indexing below fails - confirm.
        vurl = r.html.search('var vHLSurl    = "{}";')[0]
        vtype = 'm3u8'
    return file_name, vurl, vtype

#保存文件
def save(file_name, vurl, vtype):
    """Download a video to disk according to its type.

    Args:
        file_name: base name for the output (no extension).
        vurl: address of the video file or playlist.
        vtype: ``"mp4"`` downloads ``vurl`` into ``<file_name>.mp4``;
            ``"m3u8"`` delegates to ``save_m3u8``. Any other value is ignored.
    """
    if vtype == "mp4":
        file_name += ".mp4"
        r = session.get(url=vurl)
        # Binary mode: the response body is raw video bytes.
        with open(file_name, 'wb') as f:
            f.write(r.content)
    elif vtype == "m3u8":
        save_m3u8(file_name, vurl)

#处理m3u8
def save_m3u8(file_name, vurl):
    """Download an HLS playlist and all of its ``.ts`` segments.

    A directory named ``file_name`` is created (if missing); the playlist is
    stored there as ``playlist.m3u8`` and every segment it lists is saved next
    to it under the segment's own name.

    Args:
        file_name: directory to create and fill.
        vurl: URL of the playlist; segment URLs are derived by replacing
            ``playlist.m3u8`` in it with each segment name.
    """
    # exist_ok avoids a separate existence check before creating the folder.
    os.makedirs(file_name, exist_ok=True)
    r = session.get(url=vurl)
    m3u8_path = os.path.join(file_name, 'playlist.m3u8')
    with open(m3u8_path, 'wb') as f:
        f.write(r.content)
    # Iterate over LINES of the playlist; iterating the string directly would
    # walk it character by character and never match a segment name.
    for line in r.text.splitlines():
        line = line.strip()
        if line.endswith('ts'):
            ts_url = vurl.replace('playlist.m3u8', line)
            ts_path = os.path.join(file_name, line)
            r0 = session.get(url=ts_url)
            with open(ts_path, 'wb') as f:
                f.write(r0.content)


if __name__ == "__main__":
    # Walk every index page, then every detail page it links to, and save
    # the video found on each detail page.
    for index_page in get_index_page():
        for detail_url in get_detail_page(index_page):
            file_name, vurl, vtype = get_url_name(detail_url)
            save(file_name, vurl, vtype)

#  上述的for循环,是由于yield导致的!建议使用,看起来大气

 

知识点补充:

# print(str('电影'.encode('utf-8')).strip("b'").upper().replace('\\X','%'))

#    前端页面对中文的参数的编码原理



视频以m3u8结尾的,需要我们再进一步处理!拿到里面片段的ts文件!

 

爬取校花网的视频

原文:https://www.cnblogs.com/changwenjun-666/p/11324412.html

(0)
(0)
   
举报
评论 一句话评论(0
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!