(二)批量下载百度网站图片

时间：2020-02-14 19:17:31 阅读：228 评论：0 收藏：0 [点我收藏+]

批量下载百度网站图片

标签（空格分隔）： python

获取图片的url链接

首先，打开百度图片首页，注意下图url中的index，将index修改成flip即可把瀑布流页面切换成传统翻页版（flip），这样有利于观察不同页数的url的规律。

对比不同页数的url可发现：pn参数是已请求到的图片数量（每页20张，第二页pn=20，第三页pn=40），通过修改pn参数即可改变页数。其中gsm参数是一个16进制数（如pn=20时gsm=3c，pn=40时gsm=50），去掉无妨。%E6%9F%B4%E7%8A%AC为搜索关键词“柴犬”的URL转码。

（1）第一页：https://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1581675184369_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&sid=&word=%E6%9F%B4%E7%8A%AC
（2）第二页：https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E6%9F%B4%E7%8A%AC&pn=20&gsm=3c&ct=&ic=0&lm=-1&width=0&height=0
（3）第三页：https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E6%9F%B4%E7%8A%AC&pn=40&gsm=50&ct=&ic=0&lm=-1&width=0&height=0

因此url的格式为：

'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%s&pn=%d' %(name,i*20)#%s=name %d=i*20

(1) %用法
i.整数输出
%o —— oct 八进制
%d —— dec 十进制
%x —— hex 十六进制

print('%o' % 20)  # octal -> 24
print('%d' % 20)  # decimal -> 20
print('%x' % 20)  # hexadecimal -> 14

ii.浮点数输出
%f ——默认保留小数点后六位小数

%.3f，保留3位小数位

%e ——默认保留小数点后六位小数，以指数（科学计数法）形式输出

%.3e，保留3位小数位，使用科学计数法

%g ——在保证六位有效数字的前提下，使用小数方式，否则使用科学计数法

%.3g，保留3位有效数字，使用小数或科学计数法

print('%f' % 1.11)  # default: 6 decimal places -> 1.110000
print('%.1f' % 1.11)  # 1 decimal place -> 1.1
print('%e' % 1.11)  # default: 6 decimal places, scientific notation -> 1.110000e+00
print('%.3e' % 1.11)  # 3 decimal places, scientific notation -> 1.110e+00
print('%g' % 1111.1111)  # default: 6 significant digits -> 1111.11
print('%.7g' % 1111.1111)  # 7 significant digits -> 1111.111
print('%.2g' % 1111.1111)  # 2 significant digits, switches to scientific notation -> 1.1e+03

iii.字符串输出
%s
%10s——右对齐，占位符10位
%-10s——左对齐，占位符10位
%.2s——截取2位字符串
%10.2s——10位占位符，截取两位字符串

print('%s' % 'hello world')  # plain string output -> hello world
print('%20s' % 'hello world')  # right-aligned in a 20-character field, padded with leading spaces
print('%-20s' % 'hello world')  # left-aligned in a 20-character field, padded with trailing spaces
print('%.2s' % 'hello world')  # truncated to 2 characters -> he
print('%10.2s' % 'hello world')  # truncated to 2 characters, right-aligned in a 10-character field
print('%-10.2s' % 'hello world')  # truncated to 2 characters, left-aligned in a 10-character field

获取图片地址（objURL）

在网页源代码中使用搜索objURL观察代码再配合正则表达式可得：

# The lazy capture group (.*?) grabs the address between the quotes that follow "objURL":
results = re.findall(r'\"objURL\":\"(.*?)\"', html)

（1）hoverURL 是鼠标移动过后显示的版本
（2）thumbURL，middleURL是图片缩小的版本
（3）objURL是原图

代码框架

获取图片url代码

def getperpage(pn,name,num):
    """Fetch Baidu image-search result pages and pass the image URLs on for saving.

    For each page index i in range(int(pn)) this requests the "flip" (paged)
    search view with offset pn=i*20, extracts every "objURL" value from the
    returned HTML and calls save_to_txt(results, name, i, num).

    Args:
        pn: Number of result pages to fetch (converted with int()).
        name: Search keyword.
        num: Forwarded unchanged to save_to_txt for every page, so it limits
            the images saved per page rather than across all pages.

    A page that cannot be fetched is reported and skipped; remaining pages are
    still processed.
    """
    search_url = 'https://image.baidu.com/search/flip'
    # Browser-like request headers; identical for every page, so built once.
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
        'Sec-Fetch-Dest': 'document',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Referer': 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%E7%99%BE%E5%BA%A6',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # The parentheses form the capture group, so findall returns only the URL part.
    # Example of matched text: "objURL":"http://img.jk51.com/img_jk51/384725631.jpeg"
    # (easier to spot via "view page source" + Ctrl+F than in the F12 inspector).
    obj_url_pattern = re.compile(r'\"objURL\":\"(.*?)\"')

    for i in range(int(pn)):
        print('正在获取第{}页'.format(i + 1))
        # Let requests build the query string so that keywords containing
        # '&', '#', spaces or non-ASCII characters are percent-encoded correctly.
        params = {'tn': 'baiduimage', 'ie': 'utf-8', 'word': name, 'pn': i * 20}
        try:
            response = requests.get(search_url, params=params, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as err:
            print('第{}页获取失败：{}'.format(i + 1, err))
            continue
        # Decode the raw bytes explicitly (response.text would guess the
        # encoding instead); malformed bytes are replaced rather than raising.
        html = response.content.decode('utf-8', errors='replace')
        results = obj_url_pattern.findall(html)
        save_to_txt(results, name, i, num)

保存图片到本地

def save_to_txt(results,name,i,num,root_dir=r'D:\data\study\python\exercise\picture'):
    """Download up to num images from results into <root_dir>/<name>/.

    Despite its name, this function writes image files, not text. Files are
    named "<name><i+1>-<j>.jpg", where j is the 1-based position of the URL
    in results; a URL that fails to download keeps its number but produces
    no file.

    Args:
        results: Sequence of image URLs.
        name: Search keyword; used as the sub-directory name and file prefix.
        i: Zero-based page index, used in the file name as i + 1.
        num: Maximum number of URLs from results to attempt; values <= 0
            attempt none.
        root_dir: Base directory under which the keyword directory is
            created. Defaults to the path originally hard-coded here.
    """
    root = os.path.join(root_dir, name)
    # makedirs also creates missing parent directories and tolerates an existing one.
    os.makedirs(root, exist_ok=True)

    limit = max(int(num), 0)
    for j, result in enumerate(results[:limit], start=1):
        print('正在保存第{}个'.format(j))
        try:
            pic = requests.get(result, timeout=10)
            # Treat HTTP error responses as failures instead of saving an
            # error page under a .jpg name.
            pic.raise_for_status()
        except requests.RequestException:
            print('当前图片无法下载')
            continue
        time.sleep(1)  # pause between successful downloads to go easy on the hosts
        path = os.path.join(root, name + str(i + 1) + '-' + str(j) + '.jpg')
        with open(path, 'wb') as f:
            f.write(pic.content)

主函数代码

def _read_int(prompt):
    """Prompt repeatedly until the user enters a whole number; return it as int."""
    while True:
        raw = input(prompt)
        try:
            return int(raw)
        except ValueError:
            print('请输入整数')


def main():
    """Ask for the keyword, page count and image count, then start the crawl."""
    name=input('请输入想要搜索的关键词：')
    # NOTE: these two values were previously parsed with eval(input(...)),
    # which executes arbitrary expressions typed by the user; they are now
    # parsed strictly as integers.
    pn=_read_int('请输入爬取的页数：')
    num=_read_int('请输入爬取的图片数量：')
    getperpage(pn,name,num)


# Run only when executed as a script, so importing this module has no side effects.
if __name__ == '__main__':
    main()

引入库

import requests
import re
import os
import time

源代码

import requests
import re
import os
import time
def getperpage(pn,name,num):
    """Fetch Baidu image-search result pages and pass the image URLs on for saving.

    For each page index i in range(int(pn)) this requests the "flip" (paged)
    search view with offset pn=i*20, extracts every "objURL" value from the
    returned HTML and calls save_to_txt(results, name, i, num).

    Args:
        pn: Number of result pages to fetch (converted with int()).
        name: Search keyword.
        num: Forwarded unchanged to save_to_txt for every page, so it limits
            the images saved per page rather than across all pages.

    A page that cannot be fetched is reported and skipped; remaining pages are
    still processed.
    """
    search_url = 'https://image.baidu.com/search/flip'
    # Browser-like request headers; identical for every page, so built once.
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
        'Sec-Fetch-Dest': 'document',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Referer': 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%E7%99%BE%E5%BA%A6',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # The parentheses form the capture group, so findall returns only the URL part.
    # Example of matched text: "objURL":"http://img.jk51.com/img_jk51/384725631.jpeg"
    # (easier to spot via "view page source" + Ctrl+F than in the F12 inspector).
    obj_url_pattern = re.compile(r'\"objURL\":\"(.*?)\"')

    for i in range(int(pn)):
        print('正在获取第{}页'.format(i + 1))
        # Let requests build the query string so that keywords containing
        # '&', '#', spaces or non-ASCII characters are percent-encoded correctly.
        params = {'tn': 'baiduimage', 'ie': 'utf-8', 'word': name, 'pn': i * 20}
        try:
            response = requests.get(search_url, params=params, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as err:
            print('第{}页获取失败：{}'.format(i + 1, err))
            continue
        # Decode the raw bytes explicitly (response.text would guess the
        # encoding instead); malformed bytes are replaced rather than raising.
        html = response.content.decode('utf-8', errors='replace')
        results = obj_url_pattern.findall(html)
        save_to_txt(results, name, i, num)

def save_to_txt(results,name,i,num,root_dir=r'D:\data\study\python\exercise\picture'):
    """Download up to num images from results into <root_dir>/<name>/.

    Despite its name, this function writes image files, not text. Files are
    named "<name><i+1>-<j>.jpg", where j is the 1-based position of the URL
    in results; a URL that fails to download keeps its number but produces
    no file.

    Args:
        results: Sequence of image URLs.
        name: Search keyword; used as the sub-directory name and file prefix.
        i: Zero-based page index, used in the file name as i + 1.
        num: Maximum number of URLs from results to attempt; values <= 0
            attempt none.
        root_dir: Base directory under which the keyword directory is
            created. Defaults to the path originally hard-coded here.
    """
    root = os.path.join(root_dir, name)
    # makedirs also creates missing parent directories and tolerates an existing one.
    os.makedirs(root, exist_ok=True)

    limit = max(int(num), 0)
    for j, result in enumerate(results[:limit], start=1):
        print('正在保存第{}个'.format(j))
        try:
            pic = requests.get(result, timeout=10)
            # Treat HTTP error responses as failures instead of saving an
            # error page under a .jpg name.
            pic.raise_for_status()
        except requests.RequestException:
            print('当前图片无法下载')
            continue
        time.sleep(1)  # pause between successful downloads to go easy on the hosts
        path = os.path.join(root, name + str(i + 1) + '-' + str(j) + '.jpg')
        with open(path, 'wb') as f:
            f.write(pic.content)



def _read_int(prompt):
    """Prompt repeatedly until the user enters a whole number; return it as int."""
    while True:
        raw = input(prompt)
        try:
            return int(raw)
        except ValueError:
            print('请输入整数')


def main():
    """Ask for the keyword, page count and image count, then start the crawl."""
    name=input('请输入想要搜索的关键词：')
    # NOTE: these two values were previously parsed with eval(input(...)),
    # which executes arbitrary expressions typed by the user; they are now
    # parsed strictly as integers.
    pn=_read_int('请输入爬取的页数：')
    num=_read_int('请输入爬取的图片数量：')
    getperpage(pn,name,num)


# Run only when executed as a script, so importing this module has no side effects.
if __name__ == '__main__':
    main()

(二)批量下载百度网站图片

原文：https://www.cnblogs.com/HLBBLOG/p/12308552.html

踩

(1)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)