import requests from lxml import etree import csv import queue import threading from fake_useragent import UserAgent ua = UserAgent().random headers = { ‘User-Agent‘: ua } q_html = queue.Queue() q_big = queue.Queue() s = requests.session() s.get("https://maoyan.com/", headers=headers) # 发送请求获取HTML def request_get(url): # s = requests.session() # s.get("https://maoyan.com/", headers=headers) # s.get("https://maoyan.com/films", headers=headers) res = s.get(url, headers=headers) # print(res.text) html = etree.HTML(text=res.text) return html # 将HTML添加到队列 def get_video_list(): while 1: if q_big.empty() != True: url = q_big.get() html = request_get(url) video_ulr_list = html.xpath(‘//div[@class="movies-list"]/dl//dd‘) print("video_url_list-->{}".format(video_ulr_list)) for i in video_ulr_list: print("0.0") url_less = i.xpath(‘./div[2]/a/@href‘)[0] url = "https://maoyan.com" + url_less q_html.put(url) print("put的url:{}".format(url)) def get_video_detail(): print("==============") while 1: if q_html.empty() != True: video_ulr = q_html.get() print("get到的url:{}".format(video_ulr)) info = request_get(video_ulr) try: title = info.xpath(‘//div[@class="celeInfo-right clearfix"]/div/h1/text()‘)[0] pub_time = info.xpath(‘/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()‘)[0] star = info.xpath(‘//*[@id="app"]/div/div[1]/div/div[3]/div[1]/div[2]/div[2]/div/div[1]/ul/li/div/a/text()‘)[0] star = star.strip() star = star.replace(",", "") print(title, pub_time, star) except IndexError: pass fp = open(‘maoyan.csv‘, ‘a‘, encoding=‘utf-8‘) # newline 换行符 writer = csv.writer(fp) writer.writerow([title, pub_time, star]) fp.close() def main(): fp = open(‘maoyan.csv‘, ‘a‘, encoding=‘utf-8‘) # newline 换行符 writer = csv.writer(fp) writer.writerow([‘title‘, ‘pub_time‘, ‘star‘]) fp.close() # 构造url请求 urls = [‘https://maoyan.com/films?showType=3&offset={}‘.format(str(i)) for i in range(0, 500, 30)] print(urls) t_list = [] for url in urls: q_big.put(url) t1 = threading.Thread(target=get_video_list) # 获取电影的url列表 t_list.append(t1) t2 = threading.Thread(target=get_video_detail) # 获取url详情页 t_list.append(t2) for i in t_list: i.start() for i in t_list: i.join() if __name__ == ‘__main__‘: main()
原文:https://www.cnblogs.com/yzg-14/p/12199458.html