首页 > 编程语言 > 详细

爬虫系列---多线程爬取实例

时间:2019-02-27 22:19:33      阅读:258      评论:0      收藏:0      [点我收藏+]

1.爬取站长图片源码

#爬取站长 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html' 所有的古典美女图片
import os
import time
import random
import requests
from lxml import etree
from multiprocessing.dummy import Pool
#获取所有页面的url
# First listing page plus pages 2-6 (the site numbers paginated URLs from _2).
url = 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
page_url_list = [f'http://sc.chinaz.com/tupian/gudianmeinvtupian_{i}.html' for i in range(2, 7)]
page_url_list.insert(0, url)

# Minimal browser-like headers so the site serves normal HTML.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.20 Safari/537.36',
    # 'Content-Encoding': 'gzip',
    # 'Content-Type': 'text/html',
}
# Shared accumulator: worker threads append every image URL they find here.
pig_url_list = []
def get_pig_url(url):
    """Scrape one listing page and append each picture's image URL to the
    shared module-level list ``pig_url_list``.

    :param url: URL of a gallery listing page.
    """
    response = requests.get(url=url, headers=headers)
    # Parse the decoded HTML so XPath queries can run against it.
    tree = etree.HTML(response.content.decode())
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        # The site lazy-loads images: the real URL lives in ``src2``, not ``src``.
        # (Renamed from ``url`` to avoid shadowing the parameter.)
        pic_url = div.xpath('.//img/@src2')[0]
        pig_url_list.append(pic_url)

def download(url):
    """Download and return the raw image bytes at *url*."""
    return requests.get(url=url, headers=headers).content

def save_pig(data):
    """Write one image's bytes into the ``zhanzhangpig`` directory.

    The original random-integer filename could collide when many images are
    saved (the author's own comment flagged this); a uuid4 hex name is
    guaranteed unique across workers.

    :param data: raw image bytes as returned by :func:`download`.
    """
    import uuid  # local import keeps this fix self-contained
    name = uuid.uuid4().hex + '.jpg'
    path = 'zhanzhangpig/' + name
    with open(path, 'wb') as f:
        f.write(data)

# Make sure the output directory exists before any worker writes into it.
if not os.path.exists('zhanzhangpig'):
    os.makedirs('zhanzhangpig')

# Thread pool (multiprocessing.dummy.Pool is thread-backed): the work is
# I/O-bound, so threads overlap the network waits.
print('多线程爬取开始')
start_time = time.time()
pool = Pool(8)
pool.map(get_pig_url, page_url_list)        # phase 1: collect image URLs from every listing page
data_list = pool.map(download, pig_url_list)  # phase 2: fetch the image bytes
pool.map(save_pig, data_list)               # phase 3: write them to disk
# Close the pool before reporting, so the timing covers all completed work
# (the original closed it after printing, contradicting its own comment).
pool.close()
pool.join()
end_time = time.time()
print('多线程爬取结束')
print('耗时:', end_time - start_time)

技术分享图片

技术分享图片

 

2 爬取妹子网图片(https://www.mzitu.com/tag/ugirls/)

import os
import time
import random
import requests
from lxml import etree
from multiprocessing.dummy import Pool
# One shared Session so cookies persist across requests and TCP connections
# are reused by all worker threads.
session = requests.session()
if not os.path.exists('meizitu'):
    os.makedirs('meizitu')

# First listing page plus pages 2-16 of the tag archive.
url = 'https://www.mzitu.com/tag/ugirls/'
page_url_list = [f'https://www.mzitu.com/tag/ugirls/page/{i}/' for i in range(2, 17)]
page_url_list.insert(0, url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    # Anti-scraping measure: the site rejects requests without a Referer.
    'Referer': 'https://www.mzitu.com/tag/ugirls/',
}
# Shared accumulator: worker threads append every image URL they find here.
pig_url_list = []
def get_pig_url(url):
    """Scrape one listing page and append each picture's image URL to the
    shared module-level list ``pig_url_list``.

    :param url: URL of a tag-archive listing page.
    """
    response = session.get(url=url, headers=headers)
    # print(response.text)
    # Parse the decoded HTML so XPath queries can run against it.
    tree = etree.HTML(response.content.decode())
    li_list = tree.xpath('//ul[@id="pins"]/li')
    for li in li_list:
        # Lazy-loaded images keep the real URL in ``data-original``.
        # (Renamed from ``url`` to avoid shadowing the parameter.)
        pic_url = li.xpath('.//img/@data-original')[0]
        pig_url_list.append(pic_url)

def download(url):
    """Download and return the raw image bytes at *url*, reusing the shared
    session so the Referer/cookie state set earlier is carried along."""
    # print(url)
    return session.get(url=url, headers=headers).content

def save_pig(data):
    """Write one image's bytes into the ``meizitu`` directory.

    The original random-integer filename could collide when many images are
    saved (the author's own comment flagged this); a uuid4 hex name is
    guaranteed unique across workers.

    :param data: raw image bytes as returned by :func:`download`.
    """
    import uuid  # local import keeps this fix self-contained
    name = uuid.uuid4().hex + '.jpg'
    path = 'meizitu/' + name
    with open(path, 'wb') as f:
        f.write(data)

print('多线程爬取开始')
start_time = time.time()
# Thread pool: 10 workers for the I/O-bound crawl.
pool = Pool(10)
# pig_url_list = get_pig_url(url=url)  # single-page variant
# Multi-page crawl: collect URLs, download bytes, then save to disk.
pool.map(get_pig_url, page_url_list)
# print(pig_url_list)
data_list = pool.map(download, pig_url_list)
pool.map(save_pig, data_list)

# Shut the pool down once all three map() phases have completed.
pool.close()
pool.join()
end_time = time.time()
print('多线程爬取结束')
print('耗时:', end_time - start_time)
# -------------------- count the files actually saved --------------------
print(len(os.listdir('./meizitu')))

技术分享图片

!!!384张美图等你拿

技术分享图片

 

爬虫系列---多线程爬取实例

原文:https://www.cnblogs.com/angle6-liu/p/10439624.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!