# Simulated login: override start_requests
import scrapy

class RenrenSpider(scrapy.Spider):
    name = 'renren'

    # Override start_requests to POST the login form before crawling anything else
    def start_requests(self):
        url = "http://www.renren.com/PLogin.do"
        data = {"email": "xxx@qq.com", "password": "xxx"}
        # Pass the method itself as the callback, do not call it
        request = scrapy.FormRequest(url, formdata=data, callback=self.parse_detail)
        yield request

    # The personal homepage is only accessible after logging in
    def parse_detail(self, response):
        request = scrapy.Request(url='http://www.renren.com/xxxxx/profile', callback=self.parse)
        yield request

    # Parse the personal homepage
    def parse(self, response):
        with open('renren.html', 'w', encoding='utf-8') as fp:
            fp.write(response.text)
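A related pattern, not shown in the original notes, is to request the login page first and let Scrapy copy the form action and hidden inputs with FormRequest.from_response. A minimal sketch, assuming the login page contains a standard HTML form (the URL and field names are placeholders):

    import scrapy

    class RenrenFormSpider(scrapy.Spider):
        name = 'renren_form'
        start_urls = ['http://www.renren.com/SysHome.do']  # login page (placeholder URL)

        def parse(self, response):
            # from_response fills in the form action and hidden fields from the page,
            # so only the visible credential fields need to be supplied
            yield scrapy.FormRequest.from_response(
                response,
                formdata={'email': 'xxx@qq.com', 'password': 'xxx'},
                callback=self.after_login,
            )

        def after_login(self, response):
            # Save the page returned after the login POST for inspection
            with open('after_login.html', 'w', encoding='utf-8') as fp:
                fp.write(response.text)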
Simulated login to Renren (the spider above).
Scrapy simulated login to Douban (requires handling an image CAPTCHA; a rough sketch follows below).
CAPTCHA recognition platforms:
    Alibaba Cloud Marketplace:
        image recognition services
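A rough sketch of how the CAPTCHA step can be wired into the login flow: download the CAPTCHA image, send it to a recognition service, then submit the login form with the recognized text. The recognition call, URLs, XPath and form field names below are hypothetical placeholders, not a real API:

    import scrapy

    def recognize_captcha(image_bytes):
        # Hypothetical placeholder: call whichever CAPTCHA-recognition service you use
        # (e.g. a product from the Alibaba Cloud Marketplace) and return the text
        raise NotImplementedError

    class DoubanLoginSpider(scrapy.Spider):
        name = 'douban_login'
        start_urls = ['https://accounts.douban.com/login']  # placeholder login URL

        def parse(self, response):
            # The CAPTCHA image location is page-specific; this XPath is illustrative
            captcha_url = response.xpath("//img[@id='captcha_image']/@src").extract_first()
            if captcha_url:
                yield scrapy.Request(response.urljoin(captcha_url),
                                     callback=self.parse_captcha)

        def parse_captcha(self, response):
            captcha_text = recognize_captcha(response.body)
            # Submit the login form together with the recognized CAPTCHA text
            yield scrapy.FormRequest(
                'https://accounts.douban.com/login',  # placeholder form action
                formdata={'form_email': 'xxx@qq.com',
                          'form_password': 'xxx',
                          'captcha-solution': captcha_text},
                callback=self.after_login,
            )

        def after_login(self, response):
            self.logger.info('login response status: %s', response.status)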
Get all image links on a page:
    # xpath() takes parentheses, not brackets; extract() returns a list of src strings
    urls = response.xpath("//ul/li/a/img/@src").extract()

Completing the relative links (two options: a for loop or map, both using response.urljoin to auto-complete the URL):

    # Option 1: loop and collect the completed URLs into a new list
    full_urls = []
    for url in urls:
        full_urls.append(response.urljoin(url))

    # Option 2: map
    urls = list(map(lambda url: response.urljoin(url), urls))
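For illustration, response.urljoin resolves a relative src against the URL of the current response; the URLs below are made up:

    # assuming response.url == 'http://www.wxapp-union.com/portal.php' (made-up example)
    # response.urljoin('data/attachment/cover_1.jpg')
    # -> 'http://www.wxapp-union.com/data/attachment/cover_1.jpg'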
import os

# Build an 'images' directory two levels above this file (i.e. at the project root)
image = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
if not os.path.exists(image):
    os.mkdir(image)
else:
    print('directory already exists')
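As a side note, os.makedirs(image, exist_ok=True) does the same check-and-create in one call and also creates any missing parent directories.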
import os
from urllib import request

class WxappPipeline(object):
    def __init__(self):
        # 'images' directory at the project root (two levels above this file)
        self.image = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
        if not os.path.exists(self.image):
            os.mkdir(self.image)
        else:
            print('directory already exists')

    def process_item(self, item, spider):
        urls = item['urls']
        title = item['title']
        # One sub-directory per article title
        img_path = os.path.join(self.image, title)
        if not os.path.exists(img_path):
            os.mkdir(img_path)
        for url in urls:
            # Synchronous download with urlretrieve is slow; it blocks the pipeline
            # (see the ImagesPipeline sketch below for the asynchronous alternative)
            img_name = url.split('_')[-1]
            request.urlretrieve(url, os.path.join(img_path, img_name))
        return item
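Because the synchronous urlretrieve calls above block the pipeline, Scrapy's built-in ImagesPipeline is the usual faster alternative: it schedules the image requests through the downloader, so they are fetched asynchronously. A minimal sketch; the item field names, module path and settings values are assumptions to adapt to the project:

    # pipelines.py: sketch using Scrapy's built-in ImagesPipeline (asynchronous downloads)
    import os
    import scrapy
    from scrapy.pipelines.images import ImagesPipeline

    class WxappImagesPipeline(ImagesPipeline):
        def get_media_requests(self, item, info):
            # One download request per image URL; carry the title along in meta
            for url in item['urls']:
                yield scrapy.Request(url, meta={'title': item['title']})

        def file_path(self, request, response=None, info=None, *, item=None):
            # Save as <title>/<original file name> under IMAGES_STORE
            title = request.meta['title']
            return os.path.join(title, request.url.split('_')[-1])

    # settings.py (assumed values):
    # ITEM_PIPELINES = {'wxapp.pipelines.WxappImagesPipeline': 300}
    # IMAGES_STORE = 'images'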
Testing the extracted fields:
    cmd: go into the project directory → cd project
    Test: scrapy shell <url>
    Note:
        the project and the cmd shell must use the same Python version/environment
    For example, once the shell has loaded the response object:
        title = response.xpath("//h1[@class='ph']/text()").extract_first()
        title
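A concrete (made-up) session might look like this; the project name, URL and printed title are placeholders:

    cd wxapp                                                        # project root (placeholder)
    scrapy shell "http://www.wxapp-union.com/article-123-1.html"    # placeholder URL
    >>> title = response.xpath("//h1[@class='ph']/text()").extract_first()
    >>> title
    'Some article title'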
Source: https://www.cnblogs.com/shaozheng/p/12776171.html