首页 > 其他 > 详细

百度贴吧爬虫程序

时间:2017-11-12 20:47:20      阅读:290      评论:0      收藏:0      [点我收藏+]

#coding:utf-8

import requests

import random



class TiebaSpider:
    """Crawl the list pages of a Baidu Tieba forum and save each page's HTML to disk.

    Fixes applied to the original (scraped) code:
    - All string delimiters were mis-encoded as U+2018 curly quotes (syntax error);
      restored to ASCII quotes. Runtime string contents are unchanged.
    - ``save_html_str`` was missing ``self`` even though ``run`` calls it as an
      instance method — that call raised ``TypeError`` at runtime.
    """

    def __init__(self, tieba_name):
        """
        :param tieba_name: name of the Tieba forum to crawl (used in the URL's kw= param)
        """
        # Fixed desktop UA so the server returns the normal desktop page.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        self.tieba_name = tieba_name
        # pn is the post offset: list page i starts at pn = i * 50.
        self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"

    def get_url_list(self):
        """Return URLs for the first 30 list pages (pn = 0, 50, ..., 1450)."""
        return [self.url_temp.format(i * 50) for i in range(30)]

    def parse_url(self, url):
        """GET *url* with the spider's headers and return the decoded response body."""
        print('正在请求%s' % url)
        res = requests.get(url, headers=self.headers)
        return res.content.decode()

    def save_html_str(self, html_str, page_num):
        """Write *html_str* to '<page_num>.html' in the current directory.

        :param html_str: decoded HTML text of one list page
        :param page_num: 1-based page number, used as the file name
        """
        # BUG FIX: original def lacked `self`, so run()'s instance call crashed.
        print('正在保存第%s页.html' % page_num)
        file_name = str(page_num) + '.html'
        # Explicit utf-8 so writing does not depend on the platform locale
        # (the page body is utf-8; default GBK on Windows would raise).
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(html_str)
            print('保存%s成功' % file_name)

    def run(self):
        """Main loop: build the URL list, fetch each page, save it to disk."""
        url_list = self.get_url_list()
        # enumerate replaces the original O(n) url_list.index(url) per iteration.
        for page_num, url in enumerate(url_list, start=1):
            html_str = self.parse_url(url)
            self.save_html_str(html_str, page_num)


if __name__ == "__main__":
    # BUG FIX: string delimiters were mis-encoded curly quotes (syntax error);
    # restored to ASCII quotes. Prompt text itself is unchanged.
    # Ask the user which forum to crawl, then run the spider.
    tieba_name = input('请输入要贴吧名:')
    tieba = TiebaSpider(tieba_name)
    tieba.run()


本文出自 “梦女孩” 博客,请务必保留此出处http://dreamgirl1314.blog.51cto.com/1159474/1981063

百度贴吧爬虫程序

原文:http://dreamgirl1314.blog.51cto.com/1159474/1981063

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!