首页 > 编程语言 > 详细

[Python爬虫]头条图集爬取

时间:2019-09-24 14:44:23      阅读:337      评论:0      收藏:0      [点我收藏+]
import requests
from urllib.parse import urlencode
import os
from hashlib import md5
from multiprocessing.pool import Pool

def get_page(offset):
    """Fetch one page of Toutiao search results for the keyword '街拍'.

    Args:
        offset: Value sent as the ``offset`` query parameter of the search API.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        ``None`` when the status is not 200 or the connection fails.
    """
    # NOTE(review): the cookie below embeds session identifiers captured from
    # a browser session; it is presumably expired and should not live in
    # source control -- consider loading it from configuration instead.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
        # Single-quoted because the value itself contains double quotes.
        'cookie': 'tt_webid=6724223385113069069; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6724223385113069069; csrftoken=9e9d6c3be6aabc313dce0c4f1a116047; sso_uid_tt=27219b1c2d00b8a6021444d85d83dc38; toutiao_sso_user=7562e682c093b193cce298f25dd396ba; login_flag=8391d980bfc8a8908e7c6c80596a016c; __tea_sdk__ssid=undefined; _ga=GA1.2.931504366.1565662966; sid_guard=7562e682c093b193cce298f25dd396ba%7C1565663040%7C5126263%7CFri%2C+11-Oct-2019+10%3A21%3A43+GMT; uid_tt=27219b1c2d00b8a6021444d85d83dc38; sid_tt=7562e682c093b193cce298f25dd396ba; sessionid=7562e682c093b193cce298f25dd396ba; uuid="w:443dcb551552404fbfde212f1054c781"; __tasessionId=i5j7qcydf1569292028372; s_v_web_id=1e7e3b52d7bc46698bb26079c99fd83d',
        'pragma': 'no-cache',
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'x-requested-with': 'XMLHttpRequest',
    }
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        # Disabled in the original script:
        # 'from': 'search_tab',
        # 'pd': 'synthesis',
    }
    query = urlencode(params)
    print(query)
    url = 'https://www.toutiao.com/api/search/content/?' + query
    try:
        response = requests.get(url, headers=headers)
    except requests.ConnectionError:
        # Return None (same as the non-200 path) so callers have a single
        # "no data" value to check instead of a sentinel they cannot parse.
        return None
    if response.status_code != 200:
        return None
    # Parse the body once and reuse it for both the debug print and the return.
    payload = response.json()
    print(payload)
    return payload

def get_image(json):
    """Yield one record per image found in a search-result payload.

    Args:
        json: Decoded search response (a dict), or ``None`` when the
            request produced no usable response.

    Yields:
        Dicts of the form ``{'image': <url>, 'title': <title>}``, one for
        every entry of every item's ``image_list``. Items lacking a
        ``title`` or having an empty/missing ``image_list`` are skipped.

    Prints ``Not parse`` and yields nothing when the payload is not a dict
    or has no non-empty ``data`` entry.
    """
    # A failed request yields None upstream; treat it like an empty payload
    # instead of raising AttributeError on ``.get``.
    entries = json.get('data') if isinstance(json, dict) else None
    if not entries:
        print('Not parse')
        return
    for item in entries:
        images = item.get('image_list')
        # Skip items without a title or without any images (covers both a
        # missing key and an empty or null list).
        if 'title' not in item or not images:
            continue
        title = item.get('title')
        for image in images:
            print(title)
            print(image)
            yield {
                'image': image.get('url'),
                'title': title,
            }
def save_image(item):
    """Download one image into a directory named after its title.

    Args:
        item: Dict with an ``'image'`` entry (the URL to download) and a
            ``'title'`` entry (used as the target directory name).

    The file is stored as ``<title>/<md5 of content>.jpg``. If a file with
    that name already exists, nothing is written and a message is printed.
    A connection failure is reported with a printed message, not raised;
    a non-200 response is skipped silently.
    """
    directory = item.get('title')
    # exist_ok avoids the check-then-create race when several pool workers
    # handle images that share a title.
    os.makedirs(directory, exist_ok=True)
    try:
        response = requests.get(item.get('image'))
    except requests.ConnectionError:
        print('Failed to Save Image')
        return
    if response.status_code != 200:
        return
    content = response.content
    # Naming the file by content hash means identical images are stored once.
    file_path = '{0}/{1}.{2}'.format(directory, md5(content).hexdigest(), 'jpg')
    if os.path.exists(file_path):
        print('Already Downloaded', file_path)
        return
    with open(file_path, 'wb') as f:
        f.write(content)

def main(offset):
    """Fetch one result page and save every image it lists.

    Args:
        offset: Passed through to ``get_page`` as the search offset.
    """
    page = get_page(offset)
    # get_page returns None when the server does not answer with HTTP 200;
    # stop here instead of handing a non-dict to get_image.
    if not isinstance(page, dict):
        return
    for item in get_image(page):
        print(item)
        save_image(item)
# Inclusive range of page groups to crawl; each group index is multiplied
# by 20 to form the offset handed to main().
GROUP_START = 1
GROUP_END = 1

if __name__ == '__main__':
    pool = Pool()
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    try:
        pool.map(main, groups)
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()

 

[Python爬虫]头条图集爬取

原文:https://www.cnblogs.com/lightmonster/p/11577909.html

(1)
(1)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!