首页 > 编程语言 > 详细

python爬虫爬取豆瓣电视剧数据

时间:2020-04-20 22:40:29      阅读:71      评论:0      收藏:0      [点我收藏+]

作为一个 Python 新手,我在以下几个问题上踩过坑:

1.因为豆瓣页面的数据加载涉及到异步加载,所以需要通过浏览器获取到真正的网页链接。

2.将字典转化为DataFrame以后写入.csv文件。DataFrame是一个表单一样的数据结构。

3.从网页获取的json数据的处理。

代码:

import re
import requests
from bs4 import BeautifulSoup
import time
import random
import string
import logging
import json
import jsonpath
import pandas as pd

import pdb

# Pool of desktop User-Agent strings; one is chosen at random for each
# request so the crawler looks less like a single automated client.
# NOTE: the scraped source had lost all string quotes (and the closing
# parenthesis on the MSIE entry); they are restored here.
User_Agents = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
]


class DoubanSpider(object):
    """Douban TV-series crawler built on the site's JSON AJAX endpoints.

    The listing pages load data asynchronously, so we hit the JSON API
    directly instead of scraping the HTML.
    """

    def __init__(self):
        # Listing endpoint; {start} is the paging offset filled in by
        # download_tvs().  (The scraped source had lost its quotes; the
        # trailing parameter is the endpoint's real "page_start" argument.)
        self.base_url = (
            "https://movie.douban.com/j/search_subjects"
            "?type=tv&tag=%E7%83%AD%E9%97%A8&sort=recommend"
            "&page_limit=20&page_start={start}"
        )
        self.full_url = self.base_url
        # Detail endpoint; the numeric subject id is appended to it.
        self.tv_detailurl = "https://movie.douban.com/j/subject_abstract?subject_id="

    def download_tvs(self, offset):
        """Fetch one listing page.

        offset: paging offset (multiples of 20 — the page_limit).
        Returns the requests Response, or None if the request failed.
        """
        self.headers = {"User-Agent": random.choice(User_Agents)}
        self.full_url = self.base_url.format(start=offset)
        resp = None
        try:
            resp = requests.get(self.full_url, headers=self.headers)
        except requests.RequestException as e:
            # Original did print(logging.error(e)), which printed None —
            # logging.error() returns nothing.  Just log the failure.
            logging.error(e)
        return resp

    def get_tvs(self, resp):
        """Turn a listing response into a list of detail-API URLs.

        Returns [] for a non-200 response and None when resp is falsy,
        mirroring the original control flow.
        """
        tv_urls = []
        if resp:
            if resp.status_code == 200:
                payload = json.loads(resp.text)
                for item in payload["subjects"]:
                    # Extract the numeric subject id from the subject URL,
                    # e.g. https://movie.douban.com/subject/26794435/ -> 26794435
                    ids = re.findall(r"[0-9]+", str(item["url"]))
                    tv_urls.append(self.tv_detailurl + str(ids[0]))
            return tv_urls
        return None

    def download_detailtvs(self, tv_urls):
        """Fetch every detail URL and return the list of 'subject' dicts."""
        tvs = []
        for url in tv_urls:
            # Rotate the User-Agent per request.
            self.headers = {"User-Agent": random.choice(User_Agents)}
            resp = requests.get(url, headers=self.headers)
            tvs.append(json.loads(resp.text)["subject"])
        return tvs


def main():
    """Crawl Douban TV-series data page by page and append rows to res_url.csv.

    Writes the CSV header once, then loops over listing pages (20 items
    each), flattens each subject dict into a single row, and appends it.
    Stops when a page yields no results (the original looped forever).
    """
    spider = DoubanSpider()
    offset = 0
    columns = ["title", "types", "directors", "actors",
               "release_year", "region", "star", "episodes_count", "rate"]
    # Write the header row once before crawling.
    pd.DataFrame({c: [] for c in columns}).to_csv("res_url.csv", mode="w", index=None)
    while True:
        resp = spider.download_tvs(offset)
        tv_urls = spider.get_tvs(resp)
        if not tv_urls:
            # No more results (or a failed request) — stop instead of
            # spinning forever, which the original while True did.
            break
        for tv in spider.download_detailtvs(tv_urls):
            # Flatten the list-valued fields into comma-joined strings so
            # each subject becomes one flat CSV row.  (This replaces a large
            # block of dead, commented-out regex mangling in the original
            # that referenced an undefined variable.)
            row = {
                "title": tv.get("title"),
                "types": ",".join(tv.get("types", [])),
                "directors": ",".join(tv.get("directors", [])),
                "actors": ",".join(tv.get("actors", [])),
                "release_year": tv.get("release_year"),
                "region": tv.get("region"),
                "star": tv.get("star"),
                "episodes_count": tv.get("episodes_count"),
                "rate": tv.get("rate"),
            }
            pd.DataFrame(row, index=[0]).to_csv(
                "res_url.csv", mode="a", index=None, header=None)
        offset += 20
        # Throttle requests to avoid being rate-limited or blocked.
        time.sleep(10)


if __name__ == "__main__":
    main()

技术分享图片

python爬虫爬取豆瓣电视剧数据

原文:https://www.cnblogs.com/yaggy/p/12740739.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!