
Scraping the Maoyan movie Top 100 with requests + re + multiprocessing

Posted: 2019-04-05 14:34:50
import re
import json
import requests
from multiprocessing import Pool
from requests.exceptions import RequestException


def get_one_page(url):
    """
    获取单页面信息
    :param url:
    :return:
    """
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """
    解析页面信息
    :param html:
    :return:
    """
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?poster-default.*?src="(.*?)"'
        r'.*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)'
        r'</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            "index": item[0],
            "image": item[1],
            "title": item[2],
            "star": item[3].strip()[3:],
            "time": item[4].strip()[5:],
            "score": item[5]+item[6]
        }


def save_to_file(content):
    """
    将信息保存到文件中
    :param content:
    :return:
    """
    with open("maoyan.txt", "a", encoding="utf-8") as f:
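        # ensure_ascii=False keeps the Chinese titles human-readable in the output file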
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


def main(offset):
    url = "https://maoyan.com/board/4?offset={}".format(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        save_to_file(item)


if __name__ == "__main__":
    # Sequential version:
    # for i in range(10):
    #     main(i*10)

    # Use a process pool to fetch the ten pages concurrently and cut the time
    # wasted waiting on the network. pool.map distributes the offsets across
    # worker processes and blocks until every task has finished, so no explicit
    # close()/join() is needed here.
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
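
Note: Maoyan has tightened its anti-crawling checks over time, so a bare requests.get call may come back with a verification page or a non-200 status instead of the board HTML. Below is a minimal sketch of get_one_page that sends a browser-like User-Agent header; the header value is only an illustrative example, not part of the original post.

import requests
from requests.exceptions import RequestException


def get_one_page(url):
    """Fetch a board page, sending a browser-like User-Agent (example value)."""
    headers = {
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/70.0.3538.77 Safari/537.36")  # example UA string
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

Also keep in mind that the ten offsets are handled by separate worker processes, so the records appended to maoyan.txt will not necessarily be in rank order, and concurrent appends from several processes are not strictly guaranteed to stay on separate lines. If ordering matters, have main() return its parsed items instead of writing them, collect the lists returned by pool.map in the parent process, and write them out there in one pass.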

Source: https://www.cnblogs.com/believepd/p/10658628.html
