首页 > 其他 > 详细

datawhale爬虫task01

时间:2019-08-06 23:33:34      阅读:99      评论:0      收藏:0      [点我收藏+]
#使用requests、正则表达式,爬取豆瓣电影top250排行榜
#要求抓取名次、影片名称、年份、导演等字段。


import requests
import re
import csv
import time
class doubanTop250():
    """Scrape Douban's movie Top 250 chart and save it to top250.csv.

    Pipeline: run() requests each of the 10 list pages, parse() pulls
    rank / name / country / director / score out of the HTML with
    regular expressions, and save_data() writes every collected record
    to a CSV file.
    """

    # One regex per field, compiled once at class-definition time.
    _RANK_RE = re.compile(r'<em class="">(\d+)</em>')
    _NAME_RE = re.compile(r'<img width="100" alt="(.*?)" src=')
    _COUNTRY_RE = re.compile(r'&nbsp;/&nbsp;(.*?)&nbsp;/&nbsp;')
    _DIRECTOR_RE = re.compile(r'导演: (.*?)&nbsp;')
    _SCORE_RE = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')

    def __init__(self):
        # Instance-level accumulator. The original used a mutable class
        # attribute, which every instance would have shared.
        self.film_list = []

    # 1. Send the HTTP request for one list page.
    def send_request(self, url):
        """GET *url* with a browser-like User-Agent and return the response."""
        # Douban rejects requests with the default python-requests UA.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/75.0.3770.100 Safari/537.36"
        }
        response = requests.get(url=url, headers=headers)
        print(response.status_code)
        return response

    # 2. Parse one page's HTML and accumulate the records.
    def parse(self, response):
        """Extract all film records from *response* into self.film_list."""
        self.film_list.extend(self._extract_films(response.content.decode()))

    @classmethod
    def _extract_films(cls, html):
        """Return a list of {rank, name, country, director, score} dicts
        parsed from one list-page's HTML string."""
        fields = zip(
            cls._RANK_RE.findall(html),
            cls._NAME_RE.findall(html),
            cls._COUNTRY_RE.findall(html),
            cls._DIRECTOR_RE.findall(html),
            cls._SCORE_RE.findall(html),
        )
        # zip stops at the shortest list, so one malformed entry cannot
        # raise an IndexError the way parallel indexing by i would.
        return [
            {"rank": r, "name": n, "country": c, "director": d, "score": s}
            for r, n, c, d, s in fields
        ]

    # 3. Store the collected data as CSV.
    def save_data(self):
        """Write self.film_list to top250.csv (header + one row per film)."""
        if not self.film_list:
            return  # nothing scraped; avoid IndexError on film_list[0]
        # newline="" is required by the csv module to avoid blank rows on
        # Windows; the with-block guarantees the file is closed on error.
        with open("top250.csv", "w", encoding="utf-8", newline="") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.film_list[0].keys())
            writer.writeheader()
            writer.writerows(self.film_list)

    # 4. Drive the whole scrape.
    def run(self):
        """Fetch, parse and save all 10 pages (25 films each)."""
        base_url = "https://movie.douban.com/top250?start="
        # start = 0, 25, ..., 225. The original range(0, 225, 25) stopped
        # at start=200 and silently missed the last page (films 226-250).
        for start in range(0, 250, 25):
            # 1. send the request, returning a response object
            response = self.send_request(base_url + str(start))
            # 2. parse the response data
            self.parse(response)
            time.sleep(5)  # be polite: throttle between page fetches
        # 3. save the data
        self.save_data()



if __name__ == "__main__":
    # Run the scraper only when executed as a script, so importing this
    # module does not trigger a 10-request network crawl as a side effect.
    doubanTop250().run()

 

datawhale爬虫task01

原文:https://www.cnblogs.com/tommyngx/p/11312172.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!