电影天堂小爬虫(xpath练习)

时间：2020-01-21 23:39:05 阅读：172 评论：0 收藏：0 [点我收藏+]

 1 # _author:   Jolly
 2 # date:  2019/8/28
 3 
import time
from urllib.parse import urljoin

import requests
from lxml import etree
 7 
 8 
 9 BASE_DOMAIN = ‘https://dytt8.net‘
10 dytt8_url = ‘https://www.dytt8.net/html/gndy/dyzz/list_23_1.html‘
11 HEADERS = {
12     ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36‘,
13     ‘Referer‘: ‘https://www.dytt8.net/html/gndy/dyzz/list_23_2.html‘
14 }
15 response = requests.get(dytt8_url, headers=HEADERS)
16 html = etree.HTML(response.text)
17 # print(etree.tostring(html, encoding=‘utf-8‘).decode(‘utf-8‘))
def get_detail_url():
    """Follow the first link of every movie table on the listing page.

    Reads the module-level ``html`` tree, resolves each table's first
    ``href`` against ``BASE_DOMAIN`` and passes the resulting URL to
    ``parse_detail_url``. Returns nothing.
    """
    tables_tag = html.xpath('//div[@class="co_content8"]//table[@class="tbspan"]')
    for table in tables_tag:
        hrefs = table.xpath('.//a/@href')
        if not hrefs:
            # A table without any link has nothing to follow; skip it rather
            # than let an IndexError end the whole crawl.
            continue
        # urljoin resolves site-relative paths against the domain and leaves
        # an already absolute href untouched, which plain concatenation
        # would corrupt.
        full_detail_url = urljoin(BASE_DOMAIN, hrefs[0])
        # Pause between detail requests.
        time.sleep(0.5)
        parse_detail_url(full_detail_url)
27 
28 
29 
# Labels of the single-line fields on a detail page, mapped to the key each
# value is stored under.
_SIMPLE_FIELDS = {
    '◎译　　名': 'translate_name',
    '◎年　　代': 'year',
    '◎产　　地': 'country',
    '◎字　　幕': 'language',
}
# Labels of the two fields whose value continues over the following lines.
_ACTORS_LABEL = '◎主　　演'
_PROFILE_LABEL = '◎简　　介'
# Every field label starts with this marker, so it also ends a multi-line value.
_FIELD_MARKER = '◎'
# Heading that follows the description block.
_DOWNLOAD_HEADING = '【下载地址】'


def _collect_block(text_lines, index, label, stop_prefixes):
    """Return the non-empty lines of the multi-line field that starts at ``index``.

    The remainder of the label line comes first (if any), followed by every
    subsequent stripped line up to, but not including, the first one that
    starts with one of ``stop_prefixes``.
    """
    collected = []
    remainder = text_lines[index].strip()[len(label):].strip()
    if remainder:
        collected.append(remainder)
    for raw in text_lines[index + 1:]:
        line = raw.strip()
        if line.startswith(stop_prefixes):
            break
        if line:
            collected.append(line)
    return collected


def _extract_text_fields(text_lines):
    """Turn the text nodes of a detail page into a dict of labelled fields."""
    fields = {}
    for index, raw in enumerate(text_lines):
        info = raw.strip()
        if not info.startswith(_FIELD_MARKER):
            continue
        if info.startswith(_ACTORS_LABEL):
            # One actor per line until the next labelled field begins.
            fields['actors'] = _collect_block(
                text_lines, index, _ACTORS_LABEL, (_FIELD_MARKER,))
        elif info.startswith(_PROFILE_LABEL):
            # Keep every paragraph of the description; it ends at the next
            # labelled field or at the download heading, whichever is first.
            paragraphs = _collect_block(
                text_lines, index, _PROFILE_LABEL,
                (_FIELD_MARKER, _DOWNLOAD_HEADING))
            fields['profile'] = '\n'.join(paragraphs)
        else:
            for label, key in _SIMPLE_FIELDS.items():
                if info.startswith(label):
                    fields[key] = info[len(label):].strip()
                    break
    return fields


def parse_detail_url(full_detail_url):
    """Download one movie detail page, print the extracted data and return it.

    Args:
        full_detail_url: Absolute URL of the detail page.

    Returns:
        A dict that may contain ``cover``, ``screenshot``, ``translate_name``,
        ``year``, ``country``, ``language``, ``actors`` (list of str),
        ``profile`` (paragraphs joined with newlines) and ``download_url``
        (list of every href inside the content block). A key is present only
        when the page provided the corresponding data; the dict is empty
        when the page has no ``div#Zoom`` block.
    """
    movie_info = {}
    response = requests.get(full_detail_url, headers=HEADERS, timeout=10)
    # The page is decoded as GBK. A stray byte that is not valid GBK would
    # raise UnicodeDecodeError under strict decoding and stop the crawl, so
    # such bytes are dropped instead.
    text = response.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)

    # NOTE(review): the None check covers a parser that yields no tree for an
    # empty document - confirm against lxml's behaviour.
    zoom_divs = html.xpath('//div[@id="Zoom"]') if html is not None else []
    if not zoom_divs:
        print(movie_info)
        return movie_info
    div_tag = zoom_divs[0]

    image_urls = div_tag.xpath('.//img/@src')
    if image_urls:
        # The first image is the cover, the second one the screenshot; with
        # a single image the cover doubles as the screenshot.
        movie_info['cover'] = image_urls[0]
        movie_info['screenshot'] = (
            image_urls[1] if len(image_urls) >= 2 else image_urls[0])

    # Text content of the block, one entry per text node.
    movie_info.update(_extract_text_fields(div_tag.xpath('.//text()')))

    # Every link inside the content block, looked up once per page.
    movie_info['download_url'] = div_tag.xpath('.//a/@href')

    print(movie_info)
    return movie_info
88 
89 
# Start the crawl only when the file is executed as a script.
if __name__ == '__main__':
    get_detail_url()

View Code

电影天堂小爬虫(xpath练习)

原文：https://www.cnblogs.com/Jolly-hu/p/12227314.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)