# _author: Jolly
# date: 2019/8/28
"""Scrape movie details from a dytt8.net listing page.

The listing page at ``dytt8_url`` is downloaded when the module is loaded.
``get_detail_url()`` walks the movie tables found on it and calls
``parse_detail_url()`` for every detail link; each detail page is reduced
to a dict of fields, which is printed.
"""

import re
import time

import requests
from lxml import etree


BASE_DOMAIN = 'https://dytt8.net'
dytt8_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    'Referer': 'https://www.dytt8.net/html/gndy/dyzz/list_23_2.html',
}

# Seconds to wait for a response; without a timeout a stalled connection
# would block the crawl forever.
REQUEST_TIMEOUT = 10

# Every labelled line inside the detail text starts with this marker.
_SECTION_MARK = '◎'
# First line of the download section; the introduction text ends before it.
_DOWNLOAD_MARK = '【下载地址】'


def _label_regex(label):
    """Compile a prefix pattern for ``label`` that ignores inner whitespace.

    The spacing inside the labels on the page is not guaranteed to be a
    single ASCII space (presumably full-width spaces are used -- TODO
    confirm against a live page), so any run of whitespace, including
    none, is accepted between the label's characters and after it.
    """
    return re.compile(r'\s*'.join(re.escape(char) for char in label) + r'\s*')


# Labels whose value is the remainder of the same line -> result key.
# NOTE(review): '◎字幕' is stored under the key 'language'; kept as-is so the
# output keys do not change.
_SIMPLE_FIELDS = (
    (_label_regex('◎译名'), 'translate_name'),
    (_label_regex('◎年代'), 'year'),
    (_label_regex('◎产地'), 'country'),
    (_label_regex('◎字幕'), 'language'),
)
_ACTORS_LABEL = _label_regex('◎主演')
_PROFILE_LABEL = _label_regex('◎简介')

# The listing page is fetched at import time; get_detail_url() reads the
# parsed tree from the module-level ``html``.
response = requests.get(dytt8_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
html = etree.HTML(response.text)


def _value_after(pattern, line):
    """Return the stripped text following ``pattern`` in ``line``.

    Returns None when ``line`` does not start with the label.
    """
    match = pattern.match(line)
    if match is None:
        return None
    return line[match.end():].strip()


def _match_simple_field(line):
    """Return ``(key, value)`` for a single-line field, else ``(None, None)``."""
    for pattern, key in _SIMPLE_FIELDS:
        value = _value_after(pattern, line)
        if value is not None:
            return key, value
    return None, None


def _continuation_lines(fragments, start, stop_prefixes):
    """Collect stripped, non-empty fragments from index ``start`` onwards.

    Collection stops at the first fragment that (after stripping) starts
    with one of ``stop_prefixes`` (a tuple of strings).
    """
    collected = []
    for raw in fragments[start:]:
        line = raw.strip()
        if line.startswith(stop_prefixes):
            break
        if line:
            collected.append(line)
    return collected


def _parse_text_fields(fragments):
    """Extract the labelled fields from the text fragments of a detail page.

    :param fragments: list of text strings, in document order.
    :return: dict that may contain 'translate_name', 'year', 'country',
        'language' (str), 'actors' (list of str) and 'profile' (str);
        a key is present only when its label was found.
    """
    fields = {}
    for index, raw in enumerate(fragments):
        line = raw.strip()
        if not line.startswith(_SECTION_MARK):
            continue

        key, value = _match_simple_field(line)
        if key is not None:
            fields[key] = value
            continue

        first_actor = _value_after(_ACTORS_LABEL, line)
        if first_actor is not None:
            # The first actor shares the label's line; the rest follow one
            # per fragment until the next labelled line.
            actors = [first_actor] if first_actor else []
            actors.extend(
                _continuation_lines(fragments, index + 1, (_SECTION_MARK,))
            )
            fields['actors'] = actors
            continue

        profile_head = _value_after(_PROFILE_LABEL, line)
        if profile_head is not None:
            # Keep every fragment of the introduction (not just one of
            # them), stopping at the download section or the next label.
            parts = [profile_head] if profile_head else []
            parts.extend(
                _continuation_lines(
                    fragments, index + 1, (_DOWNLOAD_MARK, _SECTION_MARK)
                )
            )
            fields['profile'] = '\n'.join(parts)
    return fields


def get_detail_url():
    """Visit every movie detail page linked from the listing page.

    Reads the module-level ``html`` tree, takes the first link of each
    ``table.tbspan`` inside ``div.co_content8``, prefixes it with
    ``BASE_DOMAIN`` and passes it to ``parse_detail_url``. Sleeps 0.5 s
    before each detail request.
    """
    tables = html.xpath('//div[@class="co_content8"]//table[@class="tbspan"]')
    for table in tables:
        hrefs = table.xpath('.//a/@href')
        if not hrefs:
            # A table without a link has nothing to follow; skip it rather
            # than abort the whole crawl with an IndexError.
            continue
        time.sleep(0.5)
        parse_detail_url(BASE_DOMAIN + hrefs[0])


def parse_detail_url(full_detail_url):
    """Download one detail page, extract its movie fields and print them.

    :param full_detail_url: absolute URL of the detail page.
    :return: the dict that was printed. It holds 'cover' and 'screenshot'
        (image URLs, when the page has at least one image; the cover is
        reused when there is no second image) plus the fields found by
        ``_parse_text_fields``. An empty dict is returned, and nothing is
        printed, when the page has no ``div#Zoom``.
    :raises requests.RequestException: propagated from ``requests.get``
        (e.g. on timeout).
    """
    response = requests.get(
        full_detail_url, headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    # Drop undecodable bytes instead of failing the whole page on them.
    text = response.content.decode('gbk', errors='ignore')
    page = etree.HTML(text)
    # etree.HTML may return None for an empty document.
    zoom_divs = page.xpath('//div[@id="Zoom"]') if page is not None else []
    if not zoom_divs:
        return {}
    zoom = zoom_divs[0]

    movie_info = {}
    images = zoom.xpath('.//img/@src')
    if images:
        cover = images[0]  # cover image
        movie_info['cover'] = cover
        # Second image is the screenshot; fall back to the cover.
        movie_info['screenshot'] = images[1] if len(images) >= 2 else cover

    movie_info.update(_parse_text_fields(zoom.xpath('.//text()')))

    # NOTE(review): download links ('.//a/@href' under div#Zoom) are not
    # part of the result; add them here if they are needed.
    print(movie_info)
    return movie_info


if __name__ == '__main__':
    get_detail_url()
# Original article: https://www.cnblogs.com/Jolly-hu/p/12227314.html