base64编码、bs4

时间：2017-10-19 19:16:45 阅读：306 评论：0 收藏：0 [点我收藏+]

BeautifulSoup的用法：

# Download the page and keep the response body as text.
r = requests.get('http://www.qq.com/').text
# Parse the markup with the lxml backend.
soup = BeautifulSoup(r, 'lxml')

Eg：t=soup.find_all(class_='aabbcc',text=re.compile(r'\w'))[1].string.strip()

find内，若只写有text参数则取文本；若还有标签或属性等式，则取标签。

属性class：可在标签名后直接写属性值xxx；或加个_，即class_='xxx'；或改为{'class':'xxx'}，属性名有-等标点的，也用此法。

'a'是搜索标签a，a='xxx'是属性a。可用的搜索形式：str，正则，list，True。如搜索不含src属性的a标签：soup.find_all('a',src=False)。

获取所有的a标签及t开头的标签：soup.find_all(['a',re.compile('^t')])。若无[ ]，即soup.find_all('a','xxx')，则是取class值为xxx的a标签了。

find是缺省方法，若无属性附加，则soup.find('div')可简写为soup.div；attrs是find返回对象的缺省属性，即x.attrs['href']等于x['href']；

string只能取自己的文本，后代的取不了；而stripped_strings和方法get_text('\n',strip=True)都是取对象下的所有文本，前者返回生成器，后者是str。此外，按CSS选择器定位的soup.select()方法同样可用。

包含 class 属性却不包含 id 属性的所有标签：

def has_class_but_no_id(tag):
    """Return True when *tag* has a ``class`` attribute but no ``id`` attribute.

    ``tag`` only needs to provide a ``has_attr(name)`` method. The function
    is written to be passed to ``soup.find_all`` as a filter.
    """
    return tag.has_attr('class') and not tag.has_attr('id')

soup.find_all(has_class_but_no_id)

w=标签x.标签y.extract()——把x中的y踢出，并给了w；类似于list中的pop方法。

# First element whose class is "login-container".
t = soup.find_all(class_='login-container')[0]
# extract() removes the matched <a> from t and hands it back, so t1 holds
# the link text while t no longer contains it.
t1 = t.find('a', class_='item login').extract().get_text().strip()
print(t.get_text().strip())
print(t1)

****************************************分割线****************************************

base64编码：

#验证码图片的src常用的Data URI scheme：

from io import BytesIO

from PIL import Image

import base64,requests

# Captcha endpoint; the 'captcha' field of its JSON reply is read as a
# Data URI string.
url = 'https://my.fengjr.com/api/v2/captcha?_ts=35045549418.92857'
sourceCode = requests.get(url).json()['captcha']
# Drop the Data URI header so only the base64 payload remains.
parseCode = sourceCode.replace('data:image/png;base64,', '')
# BytesIO is the in-memory file for bytes (StringIO is the str counterpart),
# which lets PIL read the decoded image like a file.
imgData = BytesIO(base64.b64decode(parseCode))
# For base64-encoded JS, CSS or HTML, print the decoded text instead:
# print(imgData.getvalue().decode())
Image.open(imgData).show()

# data:,——文本数据

# data:text/plain,——文本数据

# data:text/css,——CSS代码

# data:text/css;base64,——base64编码的CSS代码

# data:text/html,——HTML代码

# data:text/html;base64,——base64编码的HTML代码

# data:text/javascript,——Javascript代码

# data:text/javascript;base64,——base64编码的js代码

# data:image/gif;base64,——base64编码的gif图片数据

# data:image/png;base64,——base64编码的png图片

# data:image/jpeg;base64,——base64编码的jpeg图片

# data:image/x-icon;base64,——base64编码的icon图片

****************************************分割线****************************************

tkinter窗体：

from tkinter import *

from tkinter import messagebox

import requests,re

from io import BytesIO

from PIL import Image

def download():
    """Button callback: fetch the image generated for the typed name and show it.

    Reads the name from the module-level ``entry`` widget, posts it to
    ``startUrl``, looks for a ``tmp/<digits>.gif`` path in the returned
    HTML, downloads that image and opens it with PIL. Shows an info dialog
    and returns early when the name is empty or no image path is found.
    """
    startUrl = 'http://www.uustv.com/'
    name = entry.get()
    if not name:
        messagebox.showinfo('提示', '请输入姓名！')
        # Nothing to submit, so stop here instead of posting an empty name.
        return

    data = {'word': name, 'sizes': '60', 'fonts': 'jfcs.ttf', 'fontcolor': '#000000'}
    response = requests.post(url=startUrl, data=data)
    response.encoding = 'utf8'

    # Raw string with the dot escaped so only a literal ".gif" suffix matches.
    match = re.search(r'tmp/\d+?\.gif', response.text)
    if match is None:
        # Avoid an IndexError inside the Tk callback when the page has no image.
        messagebox.showinfo('提示', '未找到签名图片！')
        return
    imgUrl = startUrl + match.group()
    print(imgUrl)

    # Wrap the downloaded bytes in a file-like object so PIL can open them.
    imgData = BytesIO(requests.get(imgUrl).content)
    # NOTE(review): the original trailing comment spoke of waiting one second
    # for the pressed button to pop back before closing the window, but no
    # such delay or close call is visible in this snippet.
    Image.open(imgData).show()

# Build the main window: a 480x360 window placed at screen offset (600, 300).
root = Tk()
root.title('个性签名')
root.geometry('480x360+600+300')

# Caption label goes to the default grid cell; the name entry sits beside it.
Label(root, text='hello,python', font=('华文中宋', 20), background='yellow').grid()
entry = Entry(root, font=('微软雅黑', 20))
entry.grid(row=0, column=1)

# Clicking the button runs download(), which reads the name from `entry`.
Button(root, text='设计签名', font=20, width=20, height=1, command=download).grid(row=2, column=1)

root.mainloop()

****************************************分割线****************************************

爬取糗事百科的段子：

import requests

from bs4 import BeautifulSoup

# Fetch one listing page and parse it.
page = 1
url = 'http://www.qiushibaike.com/text/page/' + str(page)
headers = {'User-Agent': 'Mozilla/4.0'}
html = requests.get(url, headers=headers).text
soup = BeautifulSoup(html, 'lxml')

# Each joke is a <span> that is a direct child of an element with class "content".
jokes = soup.select('.content > span')

with open('糗事百科.txt', 'w', encoding='utf8') as f:
    for item in jokes:
        # Join the stripped text fragments of the joke, one per line.
        f.write('\n'.join(item.stripped_strings))
        # Equivalent alternative: f.write(item.get_text('\n', strip=True))
        f.write('\n-----------分割线---------\n')

****************************************分割线****************************************

下载起点某小说的公共章节：

import requests,re

from bs4 import BeautifulSoup

def getSourceCode(url):
    """Download *url* and return its body parsed by BeautifulSoup with lxml."""
    text = requests.get(url).text
    return BeautifulSoup(text, 'lxml')

def getBookName(html):
    """Return ``"<author>：<book name>"`` read from a parsed chapter page.

    The author is the text of the first <a> whose href matches the
    author-index URL pattern; the book name is the text of the first <a>
    with class ``act``.
    """
    # Raw pattern with the host's dots escaped so they match literally.
    author = html.find('a', href=re.compile(r'//me\.qidian\.com/authorIndex.+')).text
    # A bare second positional argument to find() is matched against class.
    bookName = html.find('a', 'act').text
    return author + '：' + bookName

def getContent(html):
    """Return the chapter title followed by the chapter text.

    The title is the text of the <h3> with class ``j_chapterName``. The body
    is the text of the <div> with class ``read-content j_readContent``, with
    its text fragments joined by newlines.
    """
    title = html.find('h3', 'j_chapterName').text
    content = html.find('div', 'read-content j_readContent').get_text('\n')
    # Collapse a run of four ideographic spaces (U+3000) to two, then drop
    # every plain space from the body.
    content = content.replace('\u3000\u3000\u3000\u3000', '\u3000\u3000').replace(' ', '')
    return title + content

def getNextChapter(html):
    """Return the absolute URL of the next chapter.

    The first <div> carrying a ``data-nurl`` attribute is looked up and its
    value is prefixed with ``https:``.
    """
    # Attribute names containing punctuation such as "-" cannot be written
    # as keyword arguments, so the dict form is required.
    return 'https:' + html.find('div', {'data-nurl': True})['data-nurl']

def main(url):
    """Crawl chapters starting at *url* and append them to ``<book name>.txt``.

    Chapters are followed through each page's next-chapter link until a
    chapter's extracted text is 500 characters or shorter.
    """
    html = getSourceCode(url)
    content = getContent(html)
    bookName = getBookName(html)
    with open('%s.txt' % bookName, 'a', encoding='utf8') as f:
        # VIP chapters expose only two or three hundred characters publicly,
        # so a short chapter ends the crawl.
        while len(content) > 500:
            # Progress line: page length, text length and the chapter's first line.
            print(len(html), len(content), content.split('\n')[0])
            f.write(content)
            nextChapterUrl = getNextChapter(html)
            html = getSourceCode(nextChapterUrl)
            content = getContent(html)

if __name__ == '__main__':
    # Start crawling from the URL of the novel's first chapter.
    u = 'https://read.qidian.com/chapter/6JSeRXxo8g01/sLuDHoqD3wIex0RJOkJclQ2'
    main(u)

base64编码、bs4

原文：http://www.cnblogs.com/scrooge/p/7693879.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)