# coding=utf-8
"""
Wrap the crawler task in a class.
Goal: fetch all of Douban's hot movies/TV shows for a given region, ordered by popularity.
Approach:
    analyze the target URL in Chrome devtools,
    build the URL,
    send the request and get the data,
    save the data,
    repeat the last three steps until the final page.
Note: the URLs in this code are no longer valid.
"""
import requests
import json


class DoubanSpider:
    def __init__(self):
        self.url_temp_list = [
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?start={}&count=18&loc_id=108288",
                "country": "US"
            },
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?start={}&count=18&loc_id=108288",
                "country": "UK"
            },
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?start={}&count=18&loc_id=108288",
                "country": "CN"
            }
        ]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36",
            "Referer": "https://m.douban.com/movie/"
        }

    def parse_url(self, url):  # send the request, return the decoded response body
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, json_str):  # extract the data
        dict_ret = json.loads(json_str)
        content_list = dict_ret["subject_collection_items"]
        total = dict_ret["total"]  # reported total item count; not necessarily accurate
        return content_list, total

    def save_content_list(self, content_list, country):  # save to disk
        with open("douban.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                content["country"] = country
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")  # newline so each item sits on its own line
        print("saved")

    def run(self):  # main logic
        for url_temp in self.url_temp_list:
            num = 0  # num is the start parameter in the URL: the offset of the first item on a page
            total = 100  # assume a first page exists; the real total replaces this after the first request
            while num < total + 18:  # strictly less than: equality means the previous request already fetched the last page
                # 1. build the start URL
                url = url_temp["url_temp"].format(num)
                # 2. send the request, get the response
                json_str = self.parse_url(url)
                # 3. extract the data
                content_list, total = self.get_content_list(json_str)
                # 4. save each page as it arrives instead of saving everything at the end,
                #    so a failure halfway through doesn't throw away the pages already fetched
                self.save_content_list(content_list, url_temp["country"])
                # if len(content_list) < 18:  # a short page also signals the end of the data
                #     break
                # 5. build the next page's URL and loop
                num += 18


if __name__ == '__main__':
    douban_spider = DoubanSpider()
    douban_spider.run()
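Since save_content_list appends one JSON object per line, douban.txt is a JSON-lines file that can be read back line by line. A minimal read-back sketch; the "title" field is my assumption about the item schema and may differ between collections:

import json

# load the JSON-lines file written by save_content_list above
with open("douban.txt", encoding="utf-8") as f:
    items = [json.loads(line) for line in f if line.strip()]

# inspect a few items via the "country" tag added during saving;
# "title" is assumed to exist on each item and may need adjusting
for item in items[:5]:
    print(item.get("country"), item.get("title"))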
1 """ 2 套路:登录首页的时候,已经给浏览器设置cookies,此时未激活 3 登录成功后返回假的cookies,激活未激活的cookies, 4 5 """ 6 import requests 7 from bs4 import BeautifulSoup 8 9 headers = { 10 "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 11 } 12 13 index = requests.get("https://dig.chouti.com/", headers=headers) 14 cookies = index.cookies.get_dict() 15 16 17 # ===========================点赞================= 18 19 # 1.登录 20 login = requests.post( 21 "https://dig.chouti.com/login", 22 data={ 23 "phone": 8615026809593, 24 "password":‘dajiahaa‘, 25 }, 26 headers=headers, 27 cookies=cookies) 28 29 # 2.点赞 30 dizan = requests.post( 31 url="https://dig.chouti.com/link/vote?linksId=25389911", 32 cookies=cookies, 33 headers=headers) 34 35 print(dizan.text)
1 """ 2 套路: 3 - 带请求头 4 - 带cookie 5 - 请求体中: 6 commit:Sign in 7 utf8:? 8 authenticity_token:放在页面隐藏表单中 9 login:asdfasdfasdf 10 password:woshiniba8 11 12 """ 13 import requests 14 from bs4 import BeautifulSoup 15 16 headers = { 17 "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 18 } 19 20 login = requests.get( 21 "https://github.com/login", 22 headers=headers, 23 ) 24 cookies = login.cookies.get_dict() 25 login_par = BeautifulSoup(login.content, ‘html.parser‘) 26 token_input = login_par.find(name=‘input‘, attrs={"name": "authenticity_token"}) 27 28 authenticity_token = token_input.attrs.get("value") 29 # 1.登录 30 re_login = requests.post( 31 "https://github.com/session", 32 data={ 33 "commit": "Sign in", 34 "utf8":"?", 35 "login": "cpcp@163.com", 36 "password": ‘cs11187‘, 37 "authenticity_token": authenticity_token, 38 "webauthn-support": "supported" 39 }, 40 cookies=cookies, 41 headers={ 42 "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 43 "Referer": "https://github.com/login" 44 } 45 ) 46 47 print(re_login.text)
Source: https://www.cnblogs.com/carlous/p/10624842.html