from urllib import request, parse
from bs4 import BeautifulSoup


class CatEye():
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'  # e.g. https://maoyan.com/board/4?offset=10
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'
        }

    # Build the URL for a given page (each page holds 10 movies)
    def get_url(self, page_n):
        params = page_n * 10
        full_url = self.url.format(params)
        return full_url

    # Fetch the HTML of one page
    def get_html(self, url):
        req = request.Request(url, headers=self.headers)
        response = request.urlopen(req)
        html = response.read().decode()
        return html

    # Parse the movie entries out of the page
    def get_info(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        movie_list_soup = soup.find("dl", attrs={'class': "board-wrapper"})
        movie_name_list = []
        for dd in movie_list_soup.find_all('dd'):
            movie_name_tmp = dd.find("p", attrs={'class': 'name'})
            movie_name = movie_name_tmp.find("a").getText().strip()
            actors = dd.find("p", attrs={'class': 'star'}).getText().strip()
            time = dd.find("p", attrs={'class': 'releasetime'}).getText().strip()
            movie_info = movie_name + ' ' + actors + ' ' + time
            movie_name_list.append(movie_info)
        return movie_name_list

    # Save the collected lines to a file
    def save_info(self, filename, info):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(info)

    # Crawl all 10 pages (100 movies) and write the result to CatEye_Board
    def runforever(self):
        info = ''
        for i in range(10):
            url = self.get_url(i)
            html = self.get_html(url)
            movie_list = self.get_info(html)
            info += '\r\n'.join(movie_list)
            info += '\r\n'
        filename = 'CatEye_Board'
        self.save_info(filename, info)


if __name__ == '__main__':
    spider = CatEye()
    spider.runforever()
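To see what get_info actually extracts, here is a minimal, self-contained check run against a hand-written HTML fragment shaped like Maoyan's board markup. The fragment and the movie data in it are made up purely for illustration, not copied from the site:

from bs4 import BeautifulSoup

# Hand-written fragment mimicking the structure the parser expects (illustrative data only)
sample_html = '''
<dl class="board-wrapper">
  <dd>
    <p class="name"><a href="/films/1">示例电影</a></p>
    <p class="star">主演：张三,李四</p>
    <p class="releasetime">上映时间：1994-09-10</p>
  </dd>
</dl>
'''

soup = BeautifulSoup(sample_html, 'html.parser')
for dd in soup.find("dl", attrs={'class': "board-wrapper"}).find_all('dd'):
    name = dd.find("p", attrs={'class': 'name'}).find("a").getText().strip()
    actors = dd.find("p", attrs={'class': 'star'}).getText().strip()
    time = dd.find("p", attrs={'class': 'releasetime'}).getText().strip()
    print(name, actors, time)
# Prints: 示例电影 主演：张三,李四 上映时间：1994-09-10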
Maoyan movies Top 100 link: https://maoyan.com/board/4
The basic approach is the same as for the Douban TOP250 crawler: it only needs the urllib request module and BeautifulSoup. The output is a bit ugly, please bear with it.
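If you want tidier output, one option is to write the three fields as CSV instead of joining them with spaces. A minimal sketch, assuming get_info is changed to return (name, actors, releasetime) tuples; the save_csv name and the column headers below are my own additions, not part of the original script:

import csv

def save_csv(filename, rows):
    # rows: a list of (name, actors, releasetime) tuples
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['name', 'actors', 'releasetime'])
        writer.writerows(rows)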
Original post: https://www.cnblogs.com/jiaqi77/p/12167293.html