//div[@class="j-r-list-c-desc"]/a/text()
put() : put an item into the queue
get() : take an item out of the queue
Queue.empty() : check whether the queue is empty
Queue.join() : block until every queued item has been marked done with task_done(), then let the rest of the program continue
threading.Thread(target=...) : create a worker thread (see the sketch below)
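A minimal, runnable sketch of how these Queue and threading calls fit together (the worker function and the item values are made up for illustration); the full spider below applies exactly the same pattern with two queues:

from queue import Queue
import threading

q = Queue()
for n in range(5):
    q.put(n)                      # put(): add a work item

def worker():
    while True:
        item = q.get()            # get(): take a work item (blocks while empty)
        print("handled", item)
        q.task_done()             # mark this item as processed

for _ in range(2):
    t = threading.Thread(target=worker)
    t.daemon = True               # daemon threads exit together with the main thread
    t.start()

q.join()                          # returns once every put() item is task_done()
print("queue empty:", q.empty())  # Queue.empty(): True once all items were taken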
import requests
from lxml import etree
from queue import Queue
import threading
import time

class BsSpider:
    def __init__(self):
        self.baseurl = "http://www.budejie.com/"
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.urlQueue = Queue()  # URL queue
        self.resQueue = Queue()  # response queue

    # Build the URL queue
    def get_url(self):
        for num in range(1, 51):
            url = self.baseurl + str(num)  # 1 is the first page
            self.urlQueue.put(url)

    # Fetch pages and fill the response queue
    def get_html(self):
        while True:
            url = self.urlQueue.get()
            res = requests.get(url, headers=self.headers)
            res.encoding = 'utf-8'
            html = res.text
            # Put the page source into the response queue
            self.resQueue.put(html)
            # Mark this URL task as done
            self.urlQueue.task_done()

    # Parse the pages
    def get_content(self):
        while True:
            # Take one page of HTML source from the response queue
            html = self.resQueue.get()
            parse_html = etree.HTML(html)
            r_list = parse_html.xpath('//div[@class="j-r-list-c-desc"]/a/text()')
            for r in r_list:
                print(r + "\n")
            # Mark this parse task as done
            self.resQueue.task_done()

    def main(self):
        # Hold all worker threads
        thread_list = []
        # Build the URL queue
        self.get_url()
        # Create the page-fetching threads
        for i in range(3):
            thread_res = threading.Thread(target=self.get_html)
            thread_list.append(thread_res)
        # Create the parsing threads
        for i in range(2):
            thread_parse = threading.Thread(target=self.get_content)
            thread_list.append(thread_parse)
        # Start all threads
        for th in thread_list:
            th.daemon = True
            th.start()
        # Block until both queues have been fully processed
        self.urlQueue.join()
        self.resQueue.join()
        print("Finished")

if __name__ == '__main__':
    begin = time.time()
    spider = BsSpider()
    spider.main()
    end = time.time()
    print("Elapsed time:", end - begin)
python -m pip install beautifulsoup4
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
soup.find_all(attr_name="attr_value")
soup.tag_name : e.g. soup.a, soup.ul
soup.tag_name.string : text content of that node
find_all() : returns a list
r_list = soup.find_all(attr_name="attr_value")
r_list = soup.find_all(class="test")
# raises an error because class is a Python keyword; use class_ instead
r_list = soup.find_all("tag_name", attrs={"attr_name": "attr_value"})
r_list = soup.find_all("div", attrs={"class": "test"})
from bs4 import BeautifulSoup

html = '<div id="text">哈哈</div>'
# Create the parsing object
soup = BeautifulSoup(html, 'lxml')
# Find nodes
r_list = soup.find_all(id="text")
print(r_list)
for r in r_list:
    print(r.get_text())

r_list = soup.find_all("div", attrs={'id': "text"})
print(r_list)

####################################
html = '''<div class="test">你好</div>
<div class="test">再见</div>
<div class="test2">
    <span>第二次</span>
</div>'''
# Text content of the divs whose class is "test"
soup = BeautifulSoup(html, 'lxml')
divs = soup.find_all("div", attrs={"class": "test"})
print(type(divs))
for div in divs:
    print(div.string)
    print(div.get_text())

# Text content of the span inside the div whose class is "test2"
divs = soup.find_all("div", attrs={"class": "test2"})
for div in divs:
    print(div.span.string)
scrapy startproject <project_name>
scrapy genspider <spider_file_name> <domain>
scrapy crawl <spider_name>
# Whether to obey the robots.txt protocol; change this to False
ROBOTSTXT_OBEY = False
# Maximum number of concurrent requests, 16 by default
CONCURRENT_REQUESTS = 32
# Download delay of 3 seconds
DOWNLOAD_DELAY = 3
# Default request headers
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': "Mozilla/5.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
# Spider middlewares
SPIDER_MIDDLEWARES = {
    'testspider.middlewares.TestspiderSpiderMiddleware': 543,
}
# Downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    'testspider.middlewares.TestspiderDownloaderMiddleware': 543,
}
# Item pipelines
ITEM_PIPELINES = {
    'testspider.pipelines.TestspiderPipeline': 300,
}
scrapy startproject baidu
cd baidu/baidu
subl items.py  (not needed for this example)
cd spiders
scrapy genspider baiduspider baidu.com
subl settings.py
cd spiders
scrapy crawl baiduspider
Running a Scrapy project from PyCharm
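A common way to do this, shown as a sketch only (the launcher file name begin.py is an assumption): put a small script next to scrapy.cfg and run that script from PyCharm; it invokes the same crawl command programmatically.

# begin.py -- hypothetical launcher script placed next to scrapy.cfg
from scrapy import cmdline

# equivalent to typing "scrapy crawl baiduspider" in a terminal
cmdline.execute("scrapy crawl baiduspider".split())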
Generators
# Fib.py
def fib(n):
    a, b, s = 0, 1, 0
    while s < n:
        a, b = b, a + b
        s += 1
        yield b

print(fib(5).__next__())
for i in fib(10):
    print(i)
1
1
2
3
5
8
13
21
34
55
89
Key points
extract() : get the text content out of a selector object
response.xpath('.../text()') returns a list of selector objects (node text): [<Selector ..., data='text content'>]
extract() takes the text out of those selector objects: ['text content']
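A small sketch of the difference, using scrapy.Selector directly on an HTML string (the markup itself is made up for illustration):

from scrapy.selector import Selector

html = '<div class="j-r-list-c-desc"><a>hello</a></div>'   # made-up sample markup
sel = Selector(text=html)

nodes = sel.xpath('//div[@class="j-r-list-c-desc"]/a/text()')
print(nodes)            # [<Selector ... data='hello'>]  -- selector objects
print(nodes.extract())  # ['hello']                      -- plain strings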
start_urls = []
process_item(self, item, spider)
Of course you can also write any other method, for example one that stores the items in a MongoDB database.
MONGODB_HOST =
MONGODB_PORT =
Set your project pipeline in settings.py:
ITEM_PIPELINES = {
    "Daomu.pipelines.DaomumongoPipeline": 100,
}
self.db.commit()  # when writing to MySQL, remember to commit the transaction
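As a hedged sketch only (the database and collection names below are assumptions, not taken from the original project), a DaomumongoPipeline like the one registered above might look like this:

# Daomu/pipelines.py -- sketch of a MongoDB pipeline; host/port are read from settings.py
import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class DaomumongoPipeline(object):
    def __init__(self):
        host = settings.get('MONGODB_HOST', '127.0.0.1')
        port = settings.get('MONGODB_PORT', 27017)
        self.client = pymongo.MongoClient(host, port)
        # database and collection names are assumptions for this sketch
        self.collection = self.client['daomudb']['novels']

    def process_item(self, item, spider):
        # store each item as a plain dict and pass it on
        self.collection.insert_one(dict(item))
        return item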
Storing the Csdn project's data in MongoDB and MySQL
https://careers.tencent.com/search.html?index=1
https://careers.tencent.com/search.html?index=2
//div[@class="search-content"]
.//h4/text()
.//span[2]/text()
.//span[3]/text()
.//span[4]/text()
.//p[2]/text()
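A sketch of how these XPaths might be wired into a spider (the spider name and the field keys are assumptions for illustration; this page is rendered from a JSON API, so the XPaths only match if the response actually contains this markup):

import scrapy


class TencentSpider(scrapy.Spider):
    name = 'tencent'   # spider name and field keys below are assumptions
    start_urls = ['https://careers.tencent.com/search.html?index=1']

    def parse(self, response):
        # one block per job posting
        for job in response.xpath('//div[@class="search-content"]'):
            yield {
                'name': job.xpath('.//h4/text()').extract_first(),
                'type': job.xpath('.//span[2]/text()').extract_first(),
                'address': job.xpath('.//span[3]/text()').extract_first(),
                'date': job.xpath('.//span[4]/text()').extract_first(),
                'duty': job.xpath('.//p[2]/text()').extract_first(),
            }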
- Fiddler (configure packet capture)
- Install the certificate on the phone
- Open in the phone's browser: http://IP:8888 (the IP is your computer's IP address; 8888 is the port configured in Fiddler)
- Download the FiddlerRoot certificate from that page
- Downloaded file name: FiddlerRoot.cer
- Install it directly
- Set the proxy
- Open the Wi-Fi network the phone is connected to and change the proxy setting to Manual
- IP address: your computer's IP (ipconfig / ifconfig)
- Port: 8888
DEFAULT_REQUEST_HEADERS = {"User-Agent": "", }
user_agents = ['', '', '', '', '']
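These two lines suggest a random User-Agent downloader middleware; a hedged sketch (the class name and the two sample User-Agent strings are assumptions) could look like this, enabled through DOWNLOADER_MIDDLEWARES in settings.py like any other middleware:

# middlewares.py -- sketch of a random User-Agent downloader middleware
import random

user_agents = [
    # fill this list with real User-Agent strings; these two are only examples
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
]


class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # pick a different User-Agent for each outgoing request
        request.headers['User-Agent'] = random.choice(user_agents)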
from scrapy.pipelines.images import ImagesPipeline
http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0
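The import above points at Scrapy's ImagesPipeline; a hedged sketch of subclassing it to download image links taken from an API response like the one above (the class name and the 'image_url' item field are assumptions):

# pipelines.py -- sketch of a custom image pipeline built on ImagesPipeline
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class RoomImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # 'image_url' is an assumed item field holding one image link
        yield scrapy.Request(item['image_url'])

It also needs IMAGES_STORE set in settings.py and an ITEM_PIPELINES entry, like any other pipeline.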
The dont_filter parameter
scrapy.Request(url, callback=..., dont_filter=False)
dont_filter : False -> the URL is automatically deduplicated
              True  -> the URL is not deduplicated
DOWNLOADER_MIDDLEWARES={"Jd.middleware.seleniumMiddleware":20}
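A hedged sketch of what a seleniumMiddleware like the one registered above might do: render the page in a headless browser inside process_request and hand the rendered HTML back to Scrapy, skipping the normal downloader (the driver options and the "render everything" behaviour are assumptions):

# Jd/middleware.py -- sketch of a Selenium downloader middleware
from scrapy.http import HtmlResponse
from selenium import webdriver


class seleniumMiddleware(object):
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=options)

    def process_request(self, request, spider):
        # render the page in the browser and return the final HTML,
        # so the normal downloader is skipped for this request
        self.driver.get(request.url)
        return HtmlResponse(url=request.url,
                            body=self.driver.page_source,
                            encoding='utf-8',
                            request=request)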
https://sourceforge.net/projects/tesseract-ocr-alt/files/tesseract-ocr-setup-3.02.02.exe/download
sudo apt-get install tesseract-ocr
brew install tesseract
python -m pip install pytesseract
from PIL import Image
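A minimal usage sketch once Tesseract and pytesseract are installed (the image file name is only a placeholder):

import pytesseract
from PIL import Image

# OCR a local image; 'captcha.png' is a placeholder file name
img = Image.open('captcha.png')
print(pytesseract.image_to_string(img))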
Source: https://www.cnblogs.com/haoenwei/p/10888632.html