'''
1.爬取豆瓣top250电影信息
- 第一页: https://movie.douban.com/top250?start=0&filter=
- 第二页: https://movie.douban.com/top250?start=25&filter=
- 第三页: https://movie.douban.com/top250?start=50&filter=
- 第十页: https://movie.douban.com/top250?start=225&filter=
- 爬取步骤:
- 1) 获取所有电影的主页url
- 2) 往每一个主页发送请求,获取响应数据
- 3) 解析并提取想要的数据(获取每一部电影的class为item的div)
- 4) 根据每一部电影的div提取电影的: 详情页url、电影名字、电影评分、评价人数
'''
import requests
import re
# Browser-like request headers; get_html passes these to requests.get.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3945.88 Safari/537.36'
    ),
}
# The three steps of a crawler
# 1. Send the request
def get_html(url, timeout=10):
    """Send a GET request to ``url`` and return the response.

    The module-level ``headers`` dict is sent along with the request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The ``requests.Response`` for the page. The HTTP status code is
        not checked here.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    return requests.get(url, headers=headers, timeout=timeout)
# 2. Parse the data
def parse_html(response):
    """Pull the movie entries out of one listing page.

    Args:
        response: Any object whose ``text`` attribute holds the page HTML.

    Returns:
        A list with one 4-tuple of strings per match, in page order:
        ``(detail_url, title, rating, vote_count)``. The list is empty when
        nothing in the page matches.
    """
    # DOTALL lets ``.`` cross line breaks, since one entry spans many lines.
    item_pattern = re.compile(
        '<div class="item">.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
        re.DOTALL,
    )
    page_html = response.text
    return item_pattern.findall(page_html)
# 3. Save the data
def save_data(movie_data_list, num, filename='douban_top250.txt'):
    """Print one movie record and append it to a text file.

    Args:
        movie_data_list: A 4-item sequence for a single movie,
            ``(detail_url, title, rating, vote_count)``, e.g.
            ('https://movie.douban.com/subject/1292052/', '肖申克的救赎', '9.7', '1737867').
        num: Rank to record for this movie.
        filename: File to append to. Defaults to ``douban_top250.txt`` in
            the current working directory.

    Raises:
        ValueError: If ``movie_data_list`` does not unpack into exactly
            four items.
    """
    url, name, point, commit = movie_data_list
    # Format the record; it starts and ends with a newline so consecutive
    # records are separated by a blank line in the output file.
    movie_data = (
        f'\n电影排名:{num}\n'
        f'详情页url:{url}\n'
        f'电影名字:{name}\n'
        f'电影评分:{point}\n'
        f'评价人数:{commit}\n'
    )
    print(movie_data)
    # Append mode: each call adds one record and keeps earlier ones.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(movie_data)
def main():
    """Crawl the ten listing pages and save every movie with its overall rank.

    Pages are addressed by the ``start`` query parameter in steps of 25
    (0, 25, ..., 225). The rank keeps counting across pages, so the first
    movie saved gets 1 and numbering continues from there.
    """
    rank = 0
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}&filter='
        index_response = get_html(url)
        # Each entry is a tuple such as:
        # ('https://movie.douban.com/subject/1292052/', '肖申克的救赎', '9.7', '1737867')
        for movie_tuple in parse_html(index_response):
            rank += 1
            save_data(movie_tuple, rank)


if __name__ == '__main__':
    main()
# 原文 (original article): https://www.cnblogs.com/xichenHome/p/12153273.html