wb_id 是微博内容所在节点的 id,对应 HTML 标签上的 mid 属性。
# Image area, multiple images: selects the src of <img> elements that are
# direct children of the media_box list items. The "{}" placeholder is filled
# with the post's mid via str.format(). XPath string literals must use ASCII
# quotes; the typographic quote characters previously here are not valid XPath.
self.multi_media_xpath = "//div[@mid='{}' and not(@minfo)]//div[@class='WB_detail']//div[@class='media_box']/ul/li/img/@src"
# Image area, single image: same container, but matches <img> at any depth
# below the list items (li//img instead of li/img).
self.single_media_xpath = "//div[@mid='{}' and not(@minfo)]//div[@class='WB_detail']//div[@class='media_box']/ul/li//img/@src"
@decorator
def get_img_list(self, root, wb_id):
    """Collect the large-size image URLs attached to one Weibo post.

    Args:
        root: Parsed document exposing ``xpath()`` (presumably an lxml
            element -- confirm against the caller).
        wb_id: Value of the post's ``mid`` attribute; substituted into the
            XPath templates stored on ``self``.

    Returns:
        A list of ``"http:"``-prefixed image URLs whose thumbnail size token
        has been replaced by ``mw690``. Empty when the post has no images.
    """
    # Original author's note: the single-image query does not include
    # "360" long images.
    single_img_node_list = root.xpath(self.single_media_xpath.format(wb_id))
    multi_img_node_list = root.xpath(self.multi_media_xpath.format(wb_id))

    if len(multi_img_node_list) > 1:
        # Multi-image posts carry "thumb150" thumbnails.
        return ["http:" + src.replace("thumb150", "mw690") for src in multi_img_node_list]

    if single_img_node_list:
        # Single-image link form uses "orj360". Iterate the single-image
        # matches: the multi-image query can be empty here when the <img> is
        # nested deeper than li/img, which previously yielded no URLs.
        return ["http:" + src.replace("orj360", "mw690") for src in single_img_node_list]

    print("该条内容没有图片")
    return []


def save_imge(self, url, id_path, retry=1):
    """Download ``url`` into the directory ``id_path``, retrying on failure.

    The file name is the last ``/``-separated segment of ``url``. Nothing is
    downloaded when that file already exists.

    Args:
        url: Image URL to fetch.
        id_path: Target directory; created if missing.
        retry: Number of the first attempt. Attempts are counted up to 3, so
            the default of 1 allows three attempts in total.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    max_attempts = 3
    if retry > max_attempts:
        print("重试三次以上,该图片下载失败")
        return None

    filename = os.path.join(id_path, url.split("/")[-1])
    os.makedirs(id_path, exist_ok=True)

    if os.path.exists(filename):
        print("图片已经存在")
        return None

    # A single bounded loop replaces the earlier loop-plus-recursion mix,
    # which could download the same file again after a successful retry.
    for attempt in range(retry, max_attempts + 1):
        try:
            ir = requests.get(url, timeout=10)
            # Treat HTTP error statuses as failures so an error page is not
            # written to disk as if it were the image.
            ir.raise_for_status()
            print("当前下载的url", url, "id", id_path)
            with open(filename, "wb") as fs:
                fs.write(ir.content)
            return None
        except (requests.RequestException, OSError):
            if attempt < max_attempts:
                # Back off briefly before the next attempt.
                time.sleep(3)
                print("图片下载超时,开始重试,重试次数", attempt)

    print("重试三次以上,该图片下载失败")
    return None
原文:https://www.cnblogs.com/c-x-a/p/9146192.html