
Scraping images with a Python crawler


Scraping images from Yesky (天极网) with Python

To scrape images from Yesky with Python, import the requests module along with os and bs4. Take the page URL, fetch the HTML, and inspect its structure to find the div tag (and its class attribute) that wraps the images. Inside that div, locate each image's child tag, find the img tag it contains, and read the image URL from its src attribute. With the URL in hand, send a request with requests and save the response body into the img folder under the Django project.

# Basic version
# Download every thumbnail shown on the listing page

import os
import requests                  # send HTTP requests
from bs4 import BeautifulSoup    # parse the HTML

base_path = os.path.dirname(os.path.abspath(__file__))
print(base_path)
img_path = os.path.join(base_path, 'img')
os.makedirs(img_path, exist_ok=True)    # make sure the img folder exists before writing into it
response = requests.get('http://pic.yesky.com/c/6_20491_1.shtml')

soup = BeautifulSoup(response.text, 'html.parser')           # hand the response to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})   # the div that wraps the gallery, found by inspecting the page
# every thumbnail sits inside its own dl tag
list_dl = div_obj.find_all(name='dl')
for dl in list_dl:    # one dl per thumbnail
    # find the img inside this dl (check whether it is unique)
    img = dl.find(name='img')
    # read an attribute off a tag object with .get
    img_src = img.get('src')
    # with the img URL in hand, request the image itself
    img_response = requests.get(img_src)
    # build the target file name from the last path segment of the URL
    file_path = os.path.join(img_path, img_src.rsplit('/', 1)[-1])
    with open(file_path, 'wb') as f:
        f.write(img_response.content)
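
The scripts in this post assume every HTTP request succeeds and write whatever comes back to disk. As a minimal sketch (the helper name and the 10-second timeout are my own choices, not from the original post), the download step could be pulled into one function that fails loudly on HTTP errors:

import requests

def download(url, file_path, timeout=10):
    # raise on 4xx/5xx instead of silently saving an error page as an image
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(resp.content)

Each requests.get / open / write trio above would then collapse into a single download(img_src, file_path) call.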




# Intermediate version
# Download the few thumbnails shown after clicking into a gallery
import os
import requests                  # send HTTP requests
from bs4 import BeautifulSoup    # parse the HTML

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')
os.makedirs(img_path, exist_ok=True)    # make sure the img folder exists
response = requests.get('http://pic.yesky.com/c/6_20491_1.shtml')

soup = BeautifulSoup(response.text, 'html.parser')           # hand the response to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})   # the gallery div, found by inspecting the page
# each dd inside the div links into one gallery

list_dd = div_obj.find_all(name='dd')
for dd in list_dd:    # one dd per gallery
    a_obj = dd.find('a')    # the a tag carries the gallery title and the detail-page URL

    # build a folder per gallery, named after the link text
    dir_path = os.path.join(img_path, a_obj.text)
    if not os.path.isdir(dir_path):   # create it only if it does not exist yet
        os.mkdir(dir_path)

    a_response = requests.get(a_obj.get('href'))
    a_response.encoding = 'GBK'       # the detail pages are GBK-encoded
    soup2 = BeautifulSoup(a_response.text, 'html.parser')
    div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
    img_list = div_obj2.find_all(name='img')
    for img in img_list:
        img_src = img.get('src')
        img_response = requests.get(img_src)
        file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
        with open(file_path, 'wb') as f:
            f.write(img_response.content)
    break    # stop after the first gallery while testing; the HD version below walks all of them
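
One caveat with building folders from a_obj.text: gallery titles can contain characters that are illegal in file names (slashes, or colons on Windows), which would make os.mkdir fail. A small sanitizer, sketched here with an assumed character set and fallback name that are not part of the original post, guards against that:

import re

def safe_dirname(title, fallback='untitled'):
    # replace characters that Windows/Unix forbid in file names
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    return cleaned or fallback

The folder line then becomes dir_path = os.path.join(img_path, safe_dirname(a_obj.text)).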



# HD version
# Download every full-resolution image in each gallery
import os
import requests                  # send HTTP requests
from bs4 import BeautifulSoup    # parse the HTML

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')
os.makedirs(img_path, exist_ok=True)    # make sure the img folder exists
response = requests.get('http://pic.yesky.com/c/6_20491_1.shtml')

soup = BeautifulSoup(response.text, 'html.parser')           # hand the response to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})   # the gallery div

list_dd = div_obj.find_all(name='dd')
for dd in list_dd:    # one dd per gallery
    a_obj = dd.find('a')

    # build a folder per gallery, named after the link text
    dir_path = os.path.join(img_path, a_obj.text)
    if not os.path.isdir(dir_path):   # create it only if it does not exist yet
        os.mkdir(dir_path)

    a_response = requests.get(a_obj.get('href'))
    a_response.encoding = 'GBK'       # the detail pages are GBK-encoded
    soup2 = BeautifulSoup(a_response.text, 'html.parser')
    div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
    try:
        img_list = div_obj2.find_all(name='img')

        for img in img_list:
            img_src = img.get('src')
            # swap the thumbnail size for the full-resolution size in the URL
            img_response = requests.get(img_src.replace('113x113', '740x-'))
            file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
            with open(file_path, 'wb') as f:
                f.write(img_response.content)
    except Exception:
        pass    # skip galleries whose detail page lacks the overview div
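
The HD trick is the replace('113x113', '740x-') call: Yesky apparently encodes the rendition size in the URL path, so rewriting that segment requests the 740-pixel-wide original instead of the thumbnail. If the larger rendition might not exist for every image, one hedge (my own addition, not part of the original script) is to fall back to the thumbnail on a non-200 response:

def fetch_hd(img_src):
    # try the 740px rendition first; fall back to the thumbnail if it is missing
    hd = requests.get(img_src.replace('113x113', '740x-'))
    if hd.status_code == 200:
        return hd.content
    return requests.get(img_src).content
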
Multi-process / multi-threaded: downloading the HD images from five listing pages

import threading
import os
import requests                  # send HTTP requests
from bs4 import BeautifulSoup    # parse the HTML

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import cpu_count    # number of CPU cores on this machine

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')
os.makedirs(img_path, exist_ok=True)    # make sure the img folder exists

def picture(num):
    # the listing pages are numbered 6_20491_1.shtml, 6_20491_2.shtml, ...
    response = requests.get(f'http://pic.yesky.com/c/6_20491_{num}.shtml')

    soup = BeautifulSoup(response.text, 'html.parser')           # hand the response to BS4
    div_obj = soup.find(name='div', attrs={'class': 'lb_box'})   # the gallery div

    list_dd = div_obj.find_all(name='dd')
    for dd in list_dd:    # one dd per gallery
        a_obj = dd.find('a')
        # build a folder per gallery, named after the link text
        dir_path = os.path.join(img_path, a_obj.text)
        if not os.path.isdir(dir_path):   # create it only if it does not exist yet
            os.mkdir(dir_path)
        a_response = requests.get(a_obj.get('href'))
        a_response.encoding = 'GBK'       # the detail pages are GBK-encoded
        soup2 = BeautifulSoup(a_response.text, 'html.parser')
        div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
        try:
            img_list = div_obj2.find_all(name='img')

            for img in img_list:
                img_src = img.get('src')
                # swap the thumbnail size for the full-resolution size in the URL
                img_response = requests.get(img_src.replace('113x113', '740x-'))
                file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
                with open(file_path, 'wb') as f:
                    f.write(img_response.content)
        except Exception:
            pass    # skip galleries whose detail page lacks the overview div


if __name__ == "__main__":
    import time
    start = time.time()

    # process pool
    # p = ProcessPoolExecutor(max_workers=cpu_count())
    # # print(cpu_count())
    # for i in range(1, 6):
    #     p.submit(picture, i)
    # p.shutdown()

    # thread pool
    t = ThreadPoolExecutor(max_workers=cpu_count())
    for i in range(1, 6):
        t.submit(picture, i)
    t.shutdown()    # block until every submitted task has finished
    print('elapsed: {}'.format(time.time() - start))

    # bare threads, one per page
    # for i in range(1, 6):
    #     a = threading.Thread(target=picture, args=(i,))
    #     a.start()
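
Since this work is I/O-bound (the threads spend most of their time waiting on HTTP responses), the pool size does not have to match cpu_count(); a larger pool is usually fine for downloads. An equivalent variant using executor.map, with an arbitrary worker count of 8 (my choice, not the original's):

with ThreadPoolExecutor(max_workers=8) as t:
    t.map(picture, range(1, 6))    # the with-block joins all workers on exit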


Original post: https://www.cnblogs.com/shenzewang/p/12221108.html
