首页 > 其他 > 详细

电影天堂小爬虫(xpath练习)

时间:2020-01-21 23:39:05      阅读:170      评论:0      收藏:0      [点我收藏+]
技术分享图片
 1 # _author:   Jolly
 2 # date:  2019/8/28
 3 
 4 import requests
 5 import time
 6 from lxml import etree
 7 
 8 
 9 BASE_DOMAIN = https://dytt8.net
10 dytt8_url = https://www.dytt8.net/html/gndy/dyzz/list_23_1.html
11 HEADERS = {
12     User-Agent: Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36,
13     Referer: https://www.dytt8.net/html/gndy/dyzz/list_23_2.html
14 }
15 response = requests.get(dytt8_url, headers=HEADERS)
16 html = etree.HTML(response.text)
17 # print(etree.tostring(html, encoding=‘utf-8‘).decode(‘utf-8‘))
def get_detail_url():
    """Visit every movie detail page linked from the parsed listing page.

    Reads the module-level ``html`` tree, takes the first link inside each
    ``table.tbspan`` under ``div.co_content8``, prefixes it with
    ``BASE_DOMAIN`` and hands the resulting URL to ``parse_detail_url``.
    Returns nothing.
    """
    tables_tag = html.xpath('//div[@class="co_content8"]//table[@class="tbspan"]')
    for table in tables_tag:
        hrefs = table.xpath('.//a/@href')
        if not hrefs:
            # A table without any link would otherwise raise IndexError and
            # abort the whole crawl; skip it instead.
            continue
        full_detail_url = BASE_DOMAIN + hrefs[0]
        # Pause between requests so the site is not hit in a tight loop.
        time.sleep(0.5)
        parse_detail_url(full_detail_url)
27 
28 
29 
def _strip_label(info, label):
    """Return ``info`` with ``label`` removed and surrounding whitespace trimmed."""
    return info.replace(label, '').strip()


def _collect_until(lines, start, stop_prefix):
    """Return the stripped, non-empty lines from ``lines[start:]`` that come
    before the first line starting with ``stop_prefix``."""
    collected = []
    for raw in lines[start:]:
        line = raw.strip()
        if line.startswith(stop_prefix):
            break
        if line:
            collected.append(line)
    return collected


def _parse_text_fields(text_infomations):
    """Extract the labelled fields from the text nodes of a detail page."""
    # Labels whose value sits on the same text node as the label itself.
    simple_fields = (
        ('◎译  名', 'translate_name'),
        ('◎年  代', 'year'),
        ('◎产  地', 'country'),
        ('◎字  幕', 'language'),
    )
    fields = {}
    for index, info in enumerate(text_infomations):
        for label, key in simple_fields:
            if info.startswith(label):
                fields[key] = _strip_label(info, label)
                break
        else:
            if info.startswith('◎主  演'):
                # The first actor shares the label's line; the remaining
                # ones follow on their own lines until the next field.
                # NOTE(review): the stop marker '◎' is reconstructed (the
                # published listing shows an empty startswith()) - confirm.
                actors = [_strip_label(info, '◎主  演')]
                actors.extend(_collect_until(text_infomations, index + 1, '◎'))
                fields['actors'] = actors
            elif info.startswith('◎简  介'):
                # Keep every synopsis line instead of overwriting the value
                # with whichever line happens to come last.
                parts = [_strip_label(info, '◎简  介')]
                parts.extend(
                    _collect_until(text_infomations, index + 1, '【下载地址】')
                )
                fields['profile'] = '\n'.join(part for part in parts if part)
    return fields


def parse_detail_url(full_detail_url):
    """Download one movie detail page, print and return the scraped fields.

    Args:
        full_detail_url: Absolute URL of the detail page.

    Returns:
        A dict of the fields found inside ``div#Zoom``: ``cover``,
        ``screenshot``, ``translate_name``, ``year``, ``country``,
        ``language``, ``actors``, ``profile`` and ``download_url``.
        Fields that are absent from the page are simply missing from the
        dict, and the dict is empty when the page has no ``div#Zoom``.

    NOTE(review): the dict key names are reconstructed from the variable
    names because the published listing lost its string quotes - confirm.
    """
    movie_infomations = {}
    response = requests.get(full_detail_url, headers=HEADERS, timeout=10)
    # The body is decoded explicitly as GBK rather than via response.text.
    text = response.content.decode('gbk')
    html = etree.HTML(text)
    zoom_divs = html.xpath('//div[@id="Zoom"]')
    if not zoom_divs:
        # Nothing to scrape; avoid the IndexError an unguarded [0] would raise.
        return movie_infomations
    div_tag = zoom_divs[0]

    thumbnail = div_tag.xpath('.//img/@src')
    if thumbnail:
        cover = thumbnail[0]                                   # cover image
        movie_infomations['cover'] = cover
        # Second image is the screenshot; fall back to the cover if absent.
        movie_infomations['screenshot'] = (
            thumbnail[1] if len(thumbnail) >= 2 else cover
        )

    text_infomations = div_tag.xpath('.//text()')              # text nodes
    movie_infomations.update(_parse_text_fields(text_infomations))

    # Queried once per page (it used to be recomputed for every text node
    # and then thrown away) and kept with the other fields.
    movie_infomations['download_url'] = div_tag.xpath('.//a/@href')

    print(movie_infomations)
    return movie_infomations
88 
89 
# Start the crawl only when the file is executed as a script.
if __name__ == '__main__':
    get_detail_url()
View Code

电影天堂小爬虫(xpath练习)

原文:https://www.cnblogs.com/Jolly-hu/p/12227314.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!