不同磁力链网站的网页结构各不相同,爬取代码(如 XPath 表达式)需要针对具体网站定制。
1,并发爬取
采用并发爬取后,请求过于密集,运行一会儿似乎就被网站封禁了。
import re
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

import requests
from lxml import etree

# Seconds to wait for the server before giving up on a request; without a
# timeout, requests.get() can block a worker thread indefinitely.
REQUEST_TIMEOUT = 10

# The site sometimes returns pagination hrefs in which the "/search/<keyword>"
# prefix is repeated several times; this pattern extracts the single
# "/search/<keyword>/page/<n>" part from such an href.
PAGE_SUFFIX_RE = re.compile(r'/search/[^/]+/page/\d+(?=\D|$)')


def get_mlink(url, headers):
    """Return the magnet link found on a single detail page.

    Args:
        url: URL of the page that holds the magnet link of one item.
        headers: HTTP headers sent with the request.

    Returns:
        The text of the first ``<textarea id="magnetLink">`` node, or None
        when the page could not be parsed or holds no such node.
    """
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    select = etree.HTML(r.text)
    try:
        return select.xpath('//textarea[@id="magnetLink"]//text()')[0]
    except (AttributeError, IndexError):
        # AttributeError: the parser returned None instead of a tree.
        # IndexError: the page has no magnet-link textarea.
        return None


def get_page_mlinks(url, headers):
    """Collect every result listed on one search-result page.

    Args:
        url: URL of one page of search results.
        headers: HTTP headers sent with every request.

    Returns:
        A list of ``(href, size, date, magnet_link)`` tuples, one per result
        row. Rows that lack a link, size or date (for example layout rows
        that are not results) are left out. ``magnet_link`` is None when the
        detail page holds no magnet link.
    """
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    select = etree.HTML(r.text)
    if select is None:
        return []
    div_rows = select.xpath('//div[@class="row"]')

    def get_each(se):
        size = se.xpath('.//div[@class="col-sm-2 col-lg-1 hidden-xs text-right size"]//text()')
        date = se.xpath('.//div[@class="col-sm-2 col-lg-2 hidden-xs text-right date"]//text()')
        href = se.xpath('.//a/@href')
        if not (href and size and date):
            # Not a complete result row: skip it without fetching anything.
            return None
        # urljoin leaves an absolute href untouched and resolves a relative
        # or scheme-less one against the page it was found on.
        return href[0], size[0], date[0], get_mlink(urljoin(url, href[0]), headers)

    with ThreadPoolExecutor() as executor:
        # Fetch the magnet links of all rows of this page concurrently.
        rows = list(executor.map(get_each, div_rows))
    return [row for row in rows if row is not None]


def get_urls(baseurl, headers, suffix=None):
    """Recursively collect the page suffixes of all search-result pages.

    Args:
        baseurl: URL of the search (first result page).
        headers: HTTP headers sent with every request.
        suffix: Path of the result page to start from; None starts at
            ``baseurl`` itself.

    Returns:
        A list of ``/search/<keyword>/page/<n>`` paths, possibly containing
        duplicates. The list is empty when the page shows no pagination
        links (for example when all results fit on a single page).
    """
    url = urljoin(baseurl, suffix) if suffix else baseurl
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    select = etree.HTML(r.text)
    if select is None:
        return []
    hrefs = select.xpath('//ul[@class="pagination pagination-lg"]'
                         '//li//a[@name="numbar"]/@href')
    # Keep only hrefs that contain a well-formed page path.
    matches = (PAGE_SUFFIX_RE.search(href) for href in hrefs)
    page_suffixes = [m.group() for m in matches if m]
    if not page_suffixes:
        return []

    # The last numbered page tells whether further pages exist: if it still
    # offers a "next page" link, continue collecting from there.
    r = requests.get(urljoin(baseurl, page_suffixes[-1]), headers=headers,
                     timeout=REQUEST_TIMEOUT)
    select = etree.HTML(r.text)
    if select is None:
        return page_suffixes
    next_page = select.xpath('//ul[@class="pagination pagination-lg"]'
                             '//li//a[@name="nextpage"]/@href')
    if next_page:
        m = PAGE_SUFFIX_RE.search(next_page[0])
        next_suffix = m.group() if m else next_page[0]
        page_suffixes = page_suffixes + get_urls(baseurl, headers, next_suffix)
    return page_suffixes


if __name__ == '__main__':
    keyword = "金刚狼3"
    baseurl = 'https://*网站url*/search/{}'.format(keyword)  # the site takes the search keyword via GET
    headers = {"Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4"}
    urls = get_urls(baseurl, headers)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    new_urls = [urljoin(baseurl, i) for i in dict.fromkeys(urls)]
    if not new_urls:
        # No pagination links were found: crawl the search page itself.
        new_urls = [baseurl]
    with ThreadPoolExecutor() as executor:
        # Pass the same headers to every call, however many pages there are.
        res = executor.map(lambda u: get_page_mlinks(u, headers), new_urls)
        for r in res:
            for i in r:
                print(i)
2,逐页爬取
手工输入关键词和页数
超过网站已有页数时,返回None
爬取单个搜索页中的所有磁力链时,仍然使用并发(线程池)。
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

import requests
from lxml import etree

# Seconds to wait for the server before giving up on a request; without a
# timeout, requests.get() can block a worker thread indefinitely.
REQUEST_TIMEOUT = 10


def get_mlink(url, headers):
    """Return the magnet link found on a single detail page.

    Args:
        url: URL of the page that holds the magnet link of one item.
        headers: HTTP headers sent with the request.

    Returns:
        The text of the first ``<textarea id="magnetLink">`` node, or None
        when the page could not be parsed or holds no such node.
    """
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    select = etree.HTML(r.text)
    try:
        return select.xpath('//textarea[@id="magnetLink"]//text()')[0]
    except (AttributeError, IndexError):
        # AttributeError: the parser returned None instead of a tree.
        # IndexError: the page has no magnet-link textarea.
        return None


def get_page_mlinks(url, headers):
    """Collect every result listed on one search-result page.

    Args:
        url: URL of one page of search results.
        headers: HTTP headers sent with every request.

    Returns:
        A list of ``(href, size, date, magnet_link)`` tuples, one per result
        row. Rows that lack a link, size or date (for example layout rows
        that are not results) are left out. ``magnet_link`` is None when the
        detail page holds no magnet link.
    """
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    select = etree.HTML(r.text)
    if select is None:
        return []
    div_rows = select.xpath('//div[@class="row"]')

    def get_each(se):
        size = se.xpath('.//div[@class="col-sm-2 col-lg-1 hidden-xs text-right size"]//text()')
        date = se.xpath('.//div[@class="col-sm-2 col-lg-2 hidden-xs text-right date"]//text()')
        href = se.xpath('.//a/@href')
        if not (href and size and date):
            # Not a complete result row: skip it without fetching anything.
            return None
        # urljoin leaves an absolute href untouched and resolves a relative
        # or scheme-less one against the page it was found on.
        return href[0], size[0], date[0], get_mlink(urljoin(url, href[0]), headers)

    with ThreadPoolExecutor() as executor:
        # Fetch the magnet links of all rows of this page concurrently.
        rows = list(executor.map(get_each, div_rows))
    return [row for row in rows if row is not None]


if __name__ == '__main__':
    keyword = input('请输入查找关键词>> ')
    page = input('请输入查找页>> ')
    url = 'https://btsow.pw/search/{}/page/{}'.format(keyword, page)
    headers = {"Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4"}
    r = get_page_mlinks(url, headers)
    if not r:
        # No result rows (e.g. the page number is past the last page):
        # report that explicitly as None.
        print(None)
    for i in r:
        print(i)
原文:https://www.cnblogs.com/guxh/p/10702472.html