# Three small scraping demos built on requests + BeautifulSoup:
#   1. collect selected section links from the Sina news front page and save
#      the ones matching a user-supplied keyword,
#   2. dump a Weibo book list page (title, blurb, one line per book) to a file,
#   3. print the Douyu directory entries whose text contains "英雄联盟".
import requests
from bs4 import BeautifulSoup

# Seconds to wait for each HTTP response; without a timeout requests.get()
# can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 10

# Raw strings: the paths contain backslashes that must not be read as escapes.
INFO1_PATH = r'C:\Users\Administrator\Desktop\Python爬虫准备\demo\Info1.txt'
INFO2_PATH = r'C:\Users\Administrator\Desktop\Python爬虫准备\demo\Info2.txt'

# --- 1. Sina news: section links --------------------------------------------
link = 'http://news.sina.com.cn/'
res = requests.get(link, timeout=REQUEST_TIMEOUT)
res.encoding = 'utf-8'  # force res.text to be decoded as UTF-8
soup = BeautifulSoup(res.text, 'html.parser')  # res.text is the HTML source
# Every <a> element on the page. The wanted ones look like:
# <a href="http://mil.news.sina.com.cn/"><span class="titName ptn_05">军事</span></a>
alink = soup.select('a')
DATA = set()  # a set, so an identical line is stored only once
Cookies = ['军事', '教育', '科技', '文化']  # anchor texts to keep (not HTTP cookies)
for anchor in alink:
    if anchor.text not in Cookies:
        continue
    # Skip anchors that have no href instead of failing with KeyError.
    href = anchor.get('href')
    if href is None:
        continue
    DATA.add('The title of link ' + anchor.text + ' is :' + href)
for entry in DATA:
    print(entry)

Check = input('Please input the content you want to see:')
# Keep only the collected lines containing the keyword, one per output line.
Str = ''.join(entry + '\n' for entry in DATA if Check in entry)
# Explicit UTF-8 so writing the Chinese text does not depend on the
# platform's default encoding; `with` closes the file even on error.
with open(INFO1_PATH, 'w', encoding='utf-8') as File:
    File.write(Str)

# --- 2. Weibo books: title, description and book list -----------------------
HTML = 'http://book.weibo.com/newcms/tp_p4c51t160.html'
res = requests.get(HTML, timeout=REQUEST_TIMEOUT)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
title = soup.select('.S_title')
print(title[0].text)
content = soup.select('.S_explain')
print(content[0].text)
Count = soup.select('.book_vote')
Bname = soup.select('.book_name')
Aname = soup.select('.book_author')
Blink = soup.select('a')
# One entry per book: "<name>--><author>(<votes>)--link:<href>" followed by a
# blank line. zip() stops at the shortest list, so a page with mismatched
# element counts yields fewer entries rather than an IndexError.
# NOTE(review): Blink[::4] pairs book i with the (4*i)-th <a> on the page, as
# the original Blink[i*4] did -- this assumes every fourth anchor is the book
# link; confirm against the live markup.
Info = ''.join(
    book.text + '-->' + author.text
    + '(' + votes.text.replace(' ', '') + ')'
    + '--link:' + book_anchor.get('href', '') + '\n\n'
    for book, author, votes, book_anchor in zip(Bname, Aname, Count, Blink[::4])
)
print(Info)
Data = title[0].text + '\n' + content[0].text + '\n' + Info
with open(INFO2_PATH, 'w', encoding='utf-8') as F:
    F.write(Data)

# --- 3. Douyu directory: entries mentioning "英雄联盟" -----------------------
Init_link = 'https://www.douyu.com/directory/all'
Data = requests.get(Init_link, timeout=REQUEST_TIMEOUT)  # rebinds Data to the response
Data.encoding = 'utf-8'
soup = BeautifulSoup(Data.text, 'html.parser')
Res = soup.select('.mes')
Count = soup.select('p')  # not used by the code below
for item in Res:
    if '英雄联盟' in item.text:
        # Flatten the entry by dropping spaces and newlines.
        Str = item.text.replace(' ', '').replace('\n', '')
        print(Str)
        # Last three characters of the flattened entry. Str always contains
        # the 4-character keyword here, so this equals Str[len(Str)-3:len(Str)].
        print(Str[-3:])
print(len(Res))