"""BeautifulSoup cheat sheet: tag access, tree navigation, find/find_all, CSS selectors.

Only the last two sections (reading attributes and text of every <p>) are
active; the other examples are kept commented out so they can be enabled
one at a time while experimenting.
"""
from bs4 import BeautifulSoup


html = """
<html><head><title>This is a python demo page</title></head>
<body>
<p class="title"><a>The demo python introduces several python courses.</a></p>
<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1"><b class="element">Basic Python</b></a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>
</body></html>
"""

# NOTE: the 'lxml' parser needs the third-party lxml package to be installed.
soup = BeautifulSoup(html, 'lxml')

# --- Basic usage ---
# print(soup.prettify())
# print(soup.title.string)

# --- Tag selectors ---
# Selecting elements
# print(soup.title)
# print(type(soup.title))
# print(soup.head)
# print(soup.p)  # returns the first matching Tag
#
# Getting the tag name
#
# print(soup.title.name)
#
# Getting attributes
# print(soup.a.attrs['href'])
# print(soup.a['href'])
#
# Getting content
# print(soup.p.string)
#
# Nested selection
# print(soup.head.title.string)

# --- Children and descendants ---
# print(soup.body.contents)  # direct children, returned as a list
# print(soup.body.children)  # direct children, returned as an iterator
# for i, child in enumerate(soup.body.children):
#     print(i, child)

# print(soup.body.descendants)  # all descendants, returned as an iterable
# for i, child in enumerate(soup.body.descendants):
#     print(i, child)

# --- Parent and ancestors ---
# print(soup.a.parent)
#
# print(list(enumerate(soup.a.parents)))

# --- Siblings ---
# print(list(enumerate(soup.a.next_siblings)))
# print(list(enumerate(soup.a.previous_siblings)))

# --- Standard selectors ---
# find_all(name, attrs, recursive, text, **kwargs) searches the document
# by tag name, attributes, or content.
# name
# print(soup.find_all('p'))
# print(type(soup.find_all('p')[0]))
# for i in soup.find_all('p'):
#     print(i.find_all('a'))  # nested selection

# attrs
# print(soup.find_all(attrs={'href': "http://www.icourse163.org/course/BIT-268001"}))
# print(soup.find_all(attrs={'id': 'link1'}))
#
# print(soup.find_all(id='link1'))
# print(soup.find_all(class_='py1'))
#
# text: search by content
# print(soup.find_all(text='This is a python demo page'))  # used for content matching
#
# find(name, attrs, recursive, text, **kwargs)
# Same usage as find_all; find returns a single element, find_all returns all of them.
# print(soup.find('p', attrs={'class': 'course'}))
# print(type(soup.find('p')))


# --- CSS selectors (return a list) ---
# Pass a CSS selector straight to select() to perform the selection.
# Use '.' to select by class (e.g. class='course' -> .course); '#' selects by id.
# print(soup.select('.course .py1'))
# print(soup.select('p a'))  # nested selection
# print(soup.select('#link1 .element'))
# print(type(soup.select('p')[0]))
#
# Nested selection
# for p in soup.select('p'):
#     print(p.select('a'))
#

# --- Getting attributes ---
# Both forms read the same attribute; tag[...] is shorthand for tag.attrs[...].
for p in soup.select('p'):
    print(p['class'])
    print(p.attrs['class'])


# --- Getting content ---
for p in soup.select('p'):
    print(p.get_text())
# Original article: https://www.cnblogs.com/qqw-1995/p/9860501.html