#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Parse a small HTML sample with Beautiful Soup and pretty-print the tree."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# Print the parsed document in indented, human-readable form.
print(soup.prettify())
<html> <head> <title> The Dormouse‘s story </title> </head> <body> <p class="title" name="dromouse"> <b> The Dormouse‘s story </b> </p> <p class="story"> Once upon a time there were three little sisters; and their names were <a class="sister" href="" id="link1"> <!-- Elsie --> </a> , <a class="sister" href="" id="link2"> Lacie </a> and <a class="sister" href="" id="link3"> Tillie </a> ; and they lived at the bottom of a well. </p> <p class="story"> ... </p> </body> </html>
Tag 通俗点讲就是HTML中的一个个标签,例如:
<head><title>The Dormouse‘s story</title></head> <a class="sister" href="" id="link1"><!-- Elsie --></a> <p class="title" name="dromouse"><b>The Dormouse‘s story</b></p>
上面title head a p 等等HTML标签加上里面包括的内容就是Tag,那么试着使用BeautifulSoup来获取Tags:
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Access tags as attributes of the soup object (first match for each name)."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# Print the <title> tag.
print(soup.title)

# Print the <head> tag.
print(soup.head)

# Print the <a> tag (only the first one in the document is returned).
print(soup.a)

# Print the <p> tag (only the first one in the document is returned).
print(soup.p)

# Print the type of soup.p.
print(type(soup.p))
<title>The Dormouse's story</title> <head><title>The Dormouse's story</title></head> <a class="sister" href="" id="link1"><!-- Elsie --></a> <p class="title" name="dromouse"><b>The Dormouse's story</b></p> <class 'bs4.element.Tag'>
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Inspect a tag's name and read, modify and delete its attributes."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# The soup object itself is special: its name is "[document]".
print(soup.name)

# For any ordinary tag, the name is simply the tag's own name.
print(soup.head.name)

# Print all attributes of the first <p> tag; they are held in a dict.
print(soup.p.attrs)

# Print the class attribute of the <p> tag.
print(soup.p['class'])

# get() fetches an attribute by name and is equivalent to the indexing above.
print(soup.p.get('class'))

print(soup.p)

# Modify an attribute.
soup.p['class'] = "newClass"
print(soup.p)

# Delete an attribute.
del soup.p['class']
print(soup.p)
[document] head {'class': ['title'], 'name': 'dromouse'} ['title'] ['title'] <p class="title" name="dromouse"><b>The Dormouse's story</b></p> <p class="newClass" name="dromouse"><b>The Dormouse's story</b></p> <p name="dromouse"><b>The Dormouse's story</b></p>
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Read the text inside a tag through its .string attribute."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# Print the text content of the first <p> tag.
print(soup.p.string)

# Print the type of soup.p.string.
print(type(soup.p.string))
The Dormouse's story <class 'bs4.element.NavigableString'>
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Show the name and attributes of the top-level BeautifulSoup object."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# Type of the document's name (a plain str).
print(type(soup.name))

# Name of the document object.
print(soup.name)

# Attributes of the document object (an empty dict).
print(soup.attrs)
<class 'str'> [document] {}
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Show that .string on a tag holding only an HTML comment yields a Comment."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# The first <a> tag contains only an HTML comment.
print(soup.a)

# .string prints the comment text without the <!-- --> markers ...
print(soup.a.string)

# ... so check the type to tell a Comment apart from ordinary text.
print(type(soup.a.string))
<a class="sister" href="" id="link1"><!-- Elsie --></a> Elsie <class 'bs4.element.Comment'>
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""List a tag's direct children with .contents."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# .contents returns the direct children as a list.
print(soup.head.contents)

# Being a list, it can be indexed.
print(soup.head.contents[0])
[<title>The Dormouse‘s story</title>] <title>The Dormouse‘s story</title>
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Iterate over a tag's direct children with .children."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# .children is an iterator (printed as a list iterator object), not a list.
print(soup.head.children)

# Loop over it to visit every direct child.
for child in soup.head.children:
    print(child)
<list_iterator object at 0x008FF950> <title>The Dormouse‘s story</title>
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Iterate over all of a tag's descendants with .descendants."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# .descendants is a generator object.
print(soup.head.descendants)

# Loop over it to visit every descendant (child tags and their strings).
for child in soup.head.descendants:
    print(child)
<generator object descendants at 0x00519AB0> <title>The Dormouse‘s story</title> The Dormouse‘s story
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Compare .string on a parent tag and on its only child tag."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# <head> has a single child (<title>), so both lines print the same text.
print(soup.head.string)
print(soup.head.title.string)
The Dormouse‘s story The Dormouse‘s story
最简单的过滤器就是字符串:在搜索方法中传入一个字符串参数,Beautiful Soup 会查找与该字符串完整匹配的所有内容,并以列表形式返回。
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Search for tags by exact tag name with find_all()."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# A string filter matches tags whose name equals the string.
print(soup.find_all("b"))

print(soup.find_all("a"))
[<b>The Dormouse‘s story</b>] [<a class="sister" href="" id="link1"><!-- Elsie --></a>, <a class="sister" href="" id="link2">Lacie</a>, <a class="sister" href="" id="link3">Tillie</a>]
如果传入正则表达式作为参数,Beautiful Soup 会通过正则表达式的 search() 方法来匹配内容(下面的示例使用 ^ 锚定开头,因此只匹配名称以 b 开头的标签)
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Search for tags whose name matches a regular expression."""

__author__ = 'mayi'

import re

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# Print the name of every tag whose name starts with "b" (<body> and <b>).
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
body b
如果传入列表参数,Beautiful Soup会将与列表中任一元素匹配的内容以列表方式返回
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Search for tags matching any name in a list."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# A list filter returns every tag matching any one of its items.
print(soup.find_all(['a', 'b']))
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Search for tags by attribute value using a keyword argument."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# The keyword argument filters on the attribute of the same name (here: id).
print(soup.find_all(id="link1"))
[<a class="sister" href="" id="link1"><!-- Elsie --></a>]
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Search the document's strings with find_all(text=...)."""

__author__ = 'mayi'

import re

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# NOTE(review): newer Beautiful Soup releases call this argument `string`;
# `text` is the older spelling -- confirm against the installed version.

# String filter.
print(soup.find_all(text=" Elsie "))

# List filter.
print(soup.find_all(text=["Tillie", " Elsie ", "Lacie"]))

# Regular-expression filter.
print(soup.find_all(text=re.compile("Dormouse")))
[' Elsie '] [' Elsie ', 'Lacie', 'Tillie'] ["The Dormouse's story", "The Dormouse's story"]
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Select elements by tag name with CSS selectors via soup.select()."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# A bare tag name selects every element with that name; select() returns a list.
print(soup.select("title"))

print(soup.select("b"))

print(soup.select("a"))
[<title>The Dormouse‘s story</title>] [<b>The Dormouse‘s story</b>] [<a class="sister" href="" id="link1"><!-- Elsie --></a>, <a class="sister" href="" id="link2">Lacie</a>, <a class="sister" href="" id="link3">Tillie</a>]
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Select elements by CSS class with soup.select()."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# ".title" selects every element whose class is "title".
print(soup.select(".title"))
[<p class="title" name="dromouse"><b>The Dormouse‘s story</b></p>]
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Select an element by id with soup.select()."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# "#link1" selects the element whose id is "link1".
print(soup.select("#link1"))
[<a class="sister" href="" id="link1"><!-- Elsie --></a>]
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Combine a tag selector and an id selector with soup.select()."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# "p #link1" selects the element with id "link1" located inside a <p>.
print(soup.select("p #link1"))
[<a class="sister" href="" id="link1"><!-- Elsie --></a>]
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Select elements by attribute value with soup.select()."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# Attribute selector: every <a> whose class attribute equals "sister".
print(soup.select("a[class='sister']"))
[<a class="sister" href="" id="link1"><!-- Elsie --></a>, <a class="sister" href="" id="link2">Lacie</a>, <a class="sister" href="" id="link3">Tillie</a>]
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Combine a descendant selector with an attribute selector."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

# Every <a class="sister"> that sits inside a <p>.
print(soup.select("p a[class='sister']"))
[<a class="sister" href="" id="link1"><!-- Elsie --></a>, <a class="sister" href="" id="link2">Lacie</a>, <a class="sister" href="" id="link3">Tillie</a>]
#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""Extract the text of each selected element with get_text()."""

__author__ = 'mayi'

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="" class="sister" id="link1"><!-- Elsie --></a>,
<a href="" class="sister" id="link2">Lacie</a> and
<a href="" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Build the BeautifulSoup object, explicitly selecting the lxml parser.
soup = BeautifulSoup(html, "lxml")

print(soup.select("p a[class='sister']"))

# select() returns a list, so iterate and pull the text out of each element.
# The first link holds only an HTML comment, so its text comes out empty.
for item in soup.select("p a[class='sister']"):
    print(item.get_text())
[<a class="sister" href="" id="link1"><!-- Elsie --></a>, <a class="sister" href="" id="link2">Lacie</a>, <a class="sister" href="" id="link3">Tillie</a>] Lacie Tillie
注意:<!-- Elsie -->为注释内容,未输出