Python爬虫学习（二）使用Beautiful Soup库

时间：2020-01-31 00:24:12 阅读：96 评论：0 收藏：0 [点我收藏+]

（一）使用Beautiful Soup库

　　1，安装Beautiful Soup库：pip install beautifulsoup4

　　2，简单使用：

import requests;
from _socket import timeout
from bs4 import BeautifulSoup         #使用Beautiful Soup库需要导包
#from aifc import data
def getHTMLText(url):
    try:
        r=requests.get(url,timeout=30)  
        r.raise_for_status()           #如果连接状态不是200，则引发HTTPError异常
        r.encoding=r.apparent_encoding #使返回的编码正常
        print("连接成功")
        return r.status_code
    except:
        print("连接异常")
        return r.status_code

url="https://python123.io/ws/demo.html"
#keywords={"ip":"202.204.80.112"}
access={"user-agent":"Mozilla/5.0"}              #设置访问网站为浏览器Mozilla5.0
if getHTMLText(url)==200:
    r=requests.get(url, headers=access)        
    #print(r.encoding)
    r.encoding=r.apparent_encoding
    demo=r.text
    soup=BeautifulSoup(demo,"html.parser")      #解析HTML页面，使用html.parser解析器
    print(soup.prettify())                      #打印HTML代码
　 　print(soup.a.attrs)                               #打印出该HTML文件的第一个a标签的属性，获得一个字典型数据；可以根据soup.a.attrs[‘href‘]获取链接
　　print(soup.a.name) 　　　　　　　　　　　　#打印第一个a标签的标签名
   print(soup.a.parent.name)             #打印出第一个a标签的父标签的标签名