import re
import requests
# Example page URL: https://www.cnblogs.com/pythonywy/default.html?page=2
# Returns: the number of posts and the list of post links
def func(url, timeout=10):
    """Collect the distinct matching links from a paginated index.

    Pages are requested as ``{url}default.html?page=N`` for N = 1, 2, ...
    A link is kept when it starts with ``http``, ends with ``html`` and does
    not contain ``archive``. Crawling stops after the first page that
    contributes no link that was not already collected.

    NOTE: a later definition in this file reuses the name ``func`` and
    therefore replaces this one at import time.

    Args:
        url: Base address of the blog. A trailing ``/`` is appended when it
            is missing so the page URL is always well formed.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A ``(count, links)`` tuple: the number of distinct links and the
        links themselves, in the order they were first seen.
    """
    if not url.endswith('/'):
        url += '/'

    links = []
    seen = set()  # O(1) duplicate check while ``links`` keeps first-seen order
    page = 1
    while True:
        before = len(links)
        html = requests.get(
            f'{url}default.html?page={page}', timeout=timeout
        ).text
        for href in re.findall(' href="(.*?)"', html, re.S):
            if (
                href.startswith('http')
                and href.endswith('html')
                and 'archive' not in href
                and href not in seen
            ):
                seen.add(href)
                links.append(href)
        page += 1
        # A page that adds nothing new means the pagination is exhausted.
        if len(links) == before:
            return len(links), links
# Fetch the page title
def func_2(url, timeout=10):
    """Download *url* and print the text of its ``<title>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        None. The list of matched titles is printed to stdout.
    """
    # Pass the ``url`` argument itself; the literal string 'url' is not a
    # valid address and made every call fail.
    html = requests.get(url, timeout=timeout).text
    name = re.findall('<title>(.*?)</title>', html)
    print(name)
# Chain the two steps together and return the result as a dict
def func_1_deco(func_1):
    """Decorate a link collector so that it returns a title-to-URL dict.

    ``func_1`` must return a sequence whose first item is a count and whose
    second item is an iterable of URLs. The wrapper downloads every URL,
    takes the first ``<title>`` match of the page and uses the part before
    the first space as the dictionary key.

    Args:
        func_1: The callable to wrap.

    Returns:
        A wrapper with the same call signature as ``func_1`` that returns a
        dict holding ``'count'`` plus one ``name -> url`` entry per page.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func_1)  # preserve the wrapped function's name and doc
    def wrapper(*args, **kwargs):
        result = func_1(*args, **kwargs)
        print(result)
        url_lis = result[1]
        dic = {'count': result[0]}
        for url in url_lis:
            # A timeout keeps one stalled server from hanging the whole run.
            html = requests.get(url, timeout=10).text
            titles = re.findall('<title>(.*?)</title>', html)
            # A page without a <title> is keyed by its URL instead of
            # aborting the whole crawl with an IndexError.
            name = titles[0].split(' ')[0] if titles else url
            print(name)
            dic[name] = url
        return dic

    return wrapper
@func_1_deco
def func(url, timeout=10):
    """Collect the distinct matching links from a paginated index.

    Pages are requested as ``{url}default.html?page=N`` for N = 1, 2, ...
    A link is kept when it starts with ``http``, ends with ``html`` and does
    not contain ``archive``. Crawling stops after the first page that
    contributes no link that was not already collected.

    Because of the ``func_1_deco`` decorator, callers receive the dict that
    the decorator builds from this function's ``(count, links)`` result.

    Args:
        url: Base address of the blog. A trailing ``/`` is appended when it
            is missing so the page URL is always well formed.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A ``(count, links)`` tuple: the number of distinct links and the
        links themselves, in the order they were first seen.
    """
    if not url.endswith('/'):
        url += '/'

    links = []
    seen = set()  # O(1) duplicate check while ``links`` keeps first-seen order
    page = 1
    while True:
        before = len(links)
        html = requests.get(
            f'{url}default.html?page={page}', timeout=timeout
        ).text
        for href in re.findall(' href="(.*?)"', html, re.S):
            if (
                href.startswith('http')
                and href.endswith('html')
                and 'archive' not in href
                and href not in seen
            ):
                seen.add(href)
                links.append(href)
        page += 1
        # A page that adds nothing new means the pagination is exhausted.
        if len(links) == before:
            return len(links), links
func('博客园的地址')  # NOTE: the address should end with '/'; the resulting dict has one 'count' entry, and every other entry maps a title to its URL. The literal here is presumably a placeholder for the real blog address.
# Original article: https://www.cnblogs.com/pythonywy/p/11046302.html