#!/usr/bin/python
#coding=utf8
from bs4 import BeautifulSoup
import json
import requests
import itchat
from itchat.content import *
def page(url, timeout=10):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        A ``BeautifulSoup`` tree built from the response text using the
        ``lxml`` parser.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')
#from ipdb import set_trace
#set_trace()
def got_hot_article(items):
    """Return the ``<li>`` entries of the block headed '热门文章'.

    Args:
        items: Iterable of parsed elements; each is expected to expose an
            ``h2`` heading and a ``ul`` list as attributes (either may be
            ``None`` when the element lacks that child).

    Returns:
        The result of ``ul.find_all('li')`` for the first element whose
        heading text equals '热门文章', or an empty list when no element
        matches, so callers can always iterate the result.
    """
    for item in items:
        heading = item.h2
        # Elements without a heading cannot be the block we are looking for.
        if heading is None or heading.text != '热门文章':
            continue
        listing = item.ul
        # A matching heading with no list has nothing to return; keep looking.
        if listing is None:
            continue
        return listing.find_all('li')
    return []
def parse(bs_data: "BeautifulSoup"):
    """Extract the hot-article list entries from a parsed page.

    Args:
        bs_data: Parsed document to search.

    Returns:
        Whatever ``got_hot_article`` returns for the page's
        ``<div class="mps">`` elements.
    """
    blocks = bs_data.find_all('div', attrs={'class': 'mps'})
    return got_hot_article(blocks)
def format_data(items):
    """Serialize list entries to a JSON array of ``{title, url}`` objects.

    Args:
        items: Iterable of parsed elements, each expected to expose its link
            as the ``a`` attribute. ``None`` is treated as an empty iterable.

    Returns:
        A JSON string (4-space indent, non-ASCII characters kept as-is)
        holding one object per entry that has a link with an ``href``.
        Entries without a usable link are skipped.
    """
    # Imported here so the function stays self-contained.
    from urllib.parse import urljoin

    result = []
    for item in items or []:
        link = item.a
        if link is None:
            continue
        href = link.get('href')
        if not href:
            continue
        result.append({
            'title': link.text,
            # urljoin resolves site-relative paths and leaves absolute URLs
            # untouched, unlike plain string concatenation.
            'url': urljoin('http://www.ftchinese.com', href),
        })
    return json.dumps(result, indent=4, ensure_ascii=False)
def main():
    """Fetch http://www.ftchinese.com, extract its entries and print them as JSON."""
    soup = page('http://www.ftchinese.com')
    print(format_data(parse(soup)))
if __name__ == '__main__':
    try:
        main()
    except requests.RequestException as e:
        # requests reports connection, timeout and other request failures
        # through RequestException subclasses; print the error instead of
        # dumping a traceback.
        print(e)
# Original article: http://yanruohan.blog.51cto.com/9740053/1912139