爬取博客园的所有随笔的url以及计数,还有对应标题

时间：2019-06-18 17:21:01 阅读：133 评论：0 收藏：0 [点我收藏+]

1.爬取博客园的所有随笔的url以及计数,还有对应标题

import functools
import re

import requests
#https://www.cnblogs.com/pythonywy/default.html?page=2

# Returns: the number of posts and the list of post URLs
def func(url):
    """Collect the URL of every post listed on a cnblogs blog.

    The listing pages ``{url}default.html?page=N`` are fetched for
    N = 1, 2, ... and scanned for ``href`` values. Crawling stops as soon as
    a page contributes no link that has not been seen before.

    Args:
        url: Root address of the blog, e.g.
            ``https://www.cnblogs.com/<user>/``. A missing trailing slash is
            added automatically.

    Returns:
        A ``(count, urls)`` tuple: the number of distinct post URLs and the
        URLs themselves, in the order they were first encountered.
    """
    # The page address is built by plain concatenation, so the base must end
    # with a slash.
    if not url.endswith('/'):
        url += '/'

    # A dict de-duplicates while keeping first-seen order (Python 3.7+),
    # which makes the returned list deterministic.
    links = {}
    page = 1
    while True:
        known = len(links)
        html = requests.get(f'{url}default.html?page={page}').text
        for link in re.findall(' href="(.*?)"', html, re.S):
            # Keep absolute links to .html pages, excluding archive pages.
            if (link.startswith('http') and link.endswith('html')
                    and 'archive' not in link):
                links[link] = None
        page += 1
        # No new link on this page: every listing page has been visited.
        if len(links) == known:
            return known, list(links)

#获取标题
def func_2(url):
    """Download ``url`` and print the contents of its ``<title>`` tag(s).

    Args:
        url: Address of the page to fetch.
    """
    # Pass the argument itself; the quoted literal 'url' is not a valid
    # address and made every call fail.
    response = requests.get(url)
    response = response.text
    name = re.findall('<title>(.*?)</title>', response)
    print(name)

# Combine both steps: a decorator turns the crawler's result into a dictionary

def func_1_deco(func_1):
    """Decorate a crawler so that it returns a title-to-URL dictionary.

    Args:
        func_1: Callable returning a sequence whose first item is the number
            of posts and whose second item is an iterable of post URLs.

    Returns:
        A wrapper that calls ``func_1``, downloads every URL and returns a
        dict with a ``'count'`` entry plus one ``title -> url`` entry per
        post. The title key is the first space-separated token of the page's
        ``<title>``; posts sharing that token overwrite each other.
    """
    @functools.wraps(func_1)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        lis = func_1(*args, **kwargs)
        print(lis)
        dic = {'count': lis[0]}
        for url in lis[1]:
            html = requests.get(url).text
            titles = re.findall('<title>(.*?)</title>', html)
            # A page without a (single-line) <title> used to raise
            # IndexError; key such a post by its own URL instead so it is
            # not lost.
            name = titles[0].split(' ')[0] if titles else url
            print(name)
            dic[name] = url
        return dic
    return wrapper

@func_1_deco
def func(url):
    """Collect the URL of every post listed on a cnblogs blog.

    The listing pages ``{url}default.html?page=N`` are fetched for
    N = 1, 2, ... and scanned for ``href`` values. Crawling stops as soon as
    a page contributes no link that has not been seen before.

    Args:
        url: Root address of the blog, e.g.
            ``https://www.cnblogs.com/<user>/``. A missing trailing slash is
            added automatically.

    Returns:
        This body produces a ``(count, urls)`` tuple: the number of distinct
        post URLs and the URLs in first-seen order. The ``func_1_deco``
        decorator applied here consumes that tuple, so callers receive the
        decorator's result instead.
    """
    # The page address is built by plain concatenation, so the base must end
    # with a slash.
    if not url.endswith('/'):
        url += '/'

    # A dict de-duplicates while keeping first-seen order (Python 3.7+),
    # which makes the result deterministic.
    links = {}
    page = 1
    while True:
        known = len(links)
        html = requests.get(f'{url}default.html?page={page}').text
        for link in re.findall(' href="(.*?)"', html, re.S):
            # Keep absolute links to .html pages, excluding archive pages.
            if (link.startswith('http') and link.endswith('html')
                    and 'archive' not in link):
                links[link] = None
        page += 1
        # No new link on this page: every listing page has been visited.
        if len(links) == known:
            return known, list(links)
func('博客园的地址')  # Pass the blog's root address (ending with '/'); the result is a dict with one 'count' entry, every other entry maps a title to its URL

原文：https://www.cnblogs.com/pythonywy/p/11046302.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)