爬取豆瓣电影

时间：2020-06-23 22:59:57 阅读：79 评论：0 收藏：0 [点我收藏+]

import  requests
import  time
from  lxml  import  etree
import  json


# Fetch a web page
def getpage(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or any network-level failure reported by
        ``requests``).
    """
    # A mobile browser User-Agent is sent with every request.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/79.0.3945.117 Mobile Safari/537.36'
        ),
    }
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Only request failures are treated as "no page"; programming
        # errors and KeyboardInterrupt are no longer silently swallowed.
        return None
    if res.status_code == 200:
        return res.text
    return None
    
    
# Parse a web page
def parsepage(html):
    """Yield one record per ``<div class="item">`` element found in *html*.

    Args:
        html: HTML source of a list page, as text.

    Yields:
        A dict with the keys ``title``, ``index``, ``score``, ``actor`` and
        ``image``. Every value is the raw list returned by the XPath query
        (a list of strings, possibly empty).
    """
    tree = etree.HTML(html)
    for item in tree.xpath('//div[@class="item"]'):
        yield {
            'title': item.xpath('.//span[@class="title"]/text()'),
            # The query is evaluated relative to the item div itself, so
            # asking for a nested div[@class="item"] required a second item
            # div inside this one. Search the item's own descendants instead.
            # NOTE(review): assumes the rank <em> is a descendant of the
            # item div -- confirm against the live markup.
            'index': item.xpath('.//em/text()'),
            'score': item.xpath('.//span[@class="rating_num"]/text()'),
            'actor': item.xpath('.//p[@class=""]/text()'),
            'image': item.xpath('.//img[@width="100"]/@src'),
        }
        
    

# Write a record to the output file
def writefile(item):
    """Append *item* as one JSON line to ``豆瓣.json`` in the working directory.

    Args:
        item: A JSON-serialisable mapping that has a ``'title'`` key; the
            title value is echoed to stdout as a progress message.

    Raises:
        KeyError: If *item* has no ``'title'`` key.
    """
    with open('豆瓣.json', 'a', encoding='utf-8') as f:
        print('正在写入数据{}...'.format(item['title']))
        # ensure_ascii=False keeps non-ASCII characters readable in the
        # file instead of escaping them as \uXXXX.
        f.write(json.dumps(item, ensure_ascii=False))
        f.write('\n')
        
# Main routine for a single list page
def main(offset):
    """Fetch, parse and store one page of the Douban Top 250 list.

    Args:
        offset: Value substituted into the ``start`` query parameter of the
            list URL.

    Nothing is written when the page could not be fetched (``getpage``
    returned a falsy value).
    """
    url = 'https://movie.douban.com/top250?start={}'.format(offset)
    html = getpage(url)
    print('正在解析程序.....')
    if not html:
        return
    for movie in parsepage(html):
        writefile(movie)
            
            

if __name__ == "__main__":
    # Walk the start offsets 0, 25, ..., 225 and pause two seconds after
    # each page has been processed.
    page_offsets = range(0, 250, 25)
    for page_start in page_offsets:
        main(offset=page_start)
        time.sleep(2)

爬取豆瓣电影

原文：https://www.cnblogs.com/luckiness/p/13184771.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)