马蜂窝数据采集。难点在于 _sn 参数的加密:将请求参数拼接一个固定字符串后计算 MD5,再截取该 MD5 值的一部分(下标 2 至 11,共 10 位)作为 _sn。
import hashlib
import json
import time

import pandas
import pymongo
import requests
from pyquery import PyQuery as pq
from retry import retry  # noqa: F401  (imported by the original script; not applied anywhere)


class ScenicSpot:
    """Crawl scenic-spot listings from mafengwo.cn and persist the details.

    Each listing page is requested from the site's AJAX router with a signed
    form (the ``_sn`` field).  Every spot found on a page is then fetched
    individually, parsed, stored in the ``ScenicSpot.ScenicSpot`` MongoDB
    collection and finally exported to ``旅游景点.xlsx``.
    """

    # Fixed string appended to the serialized parameters before hashing.
    _SN_SALT = "c9d6618dbc657b41a66eb0af952906f1"
    # Seconds to wait for a connection/response before giving up.
    _TIMEOUT = 15
    # Pause between two detail-page requests, in seconds.
    _DELAY = 2

    def __init__(self):
        self.scenic_url = "http://www.mafengwo.cn/ajax/router.php"
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }
        # NOTE: despite its name this attribute is the MongoDB *collection*
        # ``ScenicSpot`` inside the database ``ScenicSpot``.
        self.client = pymongo.MongoClient().ScenicSpot.ScenicSpot
        # Headers for the detail pages.
        # NOTE(review): the Cookie is a captured browser session hard-coded in
        # source; it will go stale and should be supplied from configuration.
        self.pio_headers = {
            'Host': 'www.mafengwo.cn',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,'
                      'image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
            'Cookie': (
                'PHPSESSID=o98g37f4squ0aq4ubcr07d84f2; '
                'mfw_uuid=5fe69be4-4047-f9a5-fa34-c5d367eb316b; '
                'oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A'
                '%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A'
                '%222020-12-26+10%3A11%3A48%22%3B%7D; '
                '__jsluid_h=8b69bee30f0e6459c08df76385484c05; '
                '__omc_chl=; '
                '__omc_r=; '
                '__mfwc=direct; '
                '__mfwa=1608948709480.47993.1.1608948709480.1608948709480; '
                '__mfwb=0418cf79c433.1.direct; '
                '__mfwlv=1608948709; '
                '__mfwvn=1; '
                '__mfwlt=1608948709; '
                'Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1608948710; '
                'Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1608948710; '
                'uva=s%3A78%3A%22a%3A3%3A%7Bs%3A2%3A%22lt%22%3Bi%3A1608948710%3Bs%3A10%3A'
                '%22last_refer%22%3Bs%3A6%3A%22direct%22%3Bs%3A5%3A%22rhost%22%3Bs%3A0%3A'
                '%22%22%3B%7D%22%3B; '
                '__mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1608948710%3Bs%3A9%3A'
                '%22f_rdomain%22%3Bs%3A0%3A%22%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A'
                '%22www%22%3B%7D; '
                '__mfwuuid=5fe69be4-4047-f9a5-fa34-c5d367eb316b; '
                'bottom_ad_status=1; '
                'UM_distinctid=1769cd106714ce-0be3d1620af6d5-3e604809-1fa400-1769cd106729c0; '
                'CNZZDATA30065558=cnzz_eid%3D1359122677-1608945050-%26ntime%3D1608945050; '
                '__jsl_clearance=1608948735.479|0|LP8kMR7h6lJOyF4aqU9yvnUg4Ek%3D'
            ),
        }
        # Every parsed spot, in crawl order; exported by run().
        self.all_list = []

        # NOTE(review): proxy credentials are hard-coded in source; move them
        # to configuration / environment before sharing this file.
        tunnel = "tps198.kdlapi.com:15818"
        username = "t10886694756492"
        password = "bjgfg7jn"
        credentials = {"user": username, "pwd": password, "proxy": tunnel}
        self.proxies = {
            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % credentials,
            "https": "https://%(user)s:%(pwd)s@%(proxy)s/" % credentials,
        }

    def par(self, t):
        """Return characters 2..11 of the hex MD5 digest of the bytes *t*."""
        return hashlib.md5(t).hexdigest()[2:12]

    def _sign(self, params):
        """Compute the ``_sn`` value for the string-valued dict *params*.

        The parameters are serialized as compact JSON with keys in sorted
        order, the fixed salt is appended, and the result is hashed by par().
        """
        payload = json.dumps(params, sort_keys=True, separators=(",", ":"),
                             ensure_ascii=False)
        return self.par((payload + self._SN_SALT).encode('utf-8'))

    def get_page(self, pages=range(18, 21), mddid='10794'):
        """Fetch listing pages and process every scenic spot found on them.

        Args:
            pages: iterable of page numbers to request (default: 18, 19, 20).
            mddid: destination id sent as ``iMddid`` (default: ``'10794'``).

        Raises:
            requests.RequestException: on network failure, timeout or a
                non-success HTTP status of the listing request.
            RuntimeError: propagated from get_point_info() on HTTP 521.
        """
        for i in pages:
            # The same timestamp string must appear in the signature and in
            # the submitted form, so it is rendered exactly once.
            ts = str(time.time() * 1000)
            print('第' + str(i) + '页')
            form = {
                'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
                'iMddid': str(mddid),
                '_ts': ts,
                'iPage': str(i),
                'iTagId': '0',
            }
            form['_sn'] = self._sign(form)

            response = requests.post(url=self.scenic_url, headers=self.headers,
                                     data=form, proxies=self.proxies,
                                     timeout=self._TIMEOUT)
            response.raise_for_status()
            # The JSON payload carries an HTML fragment with one <li> per spot.
            fragment = pq(response.json()['data']['list'])
            for li in fragment('li').items():
                anchor = li('a')
                href = anchor.attr("href")
                if not href:
                    # An item without a link cannot be followed.
                    continue
                title = anchor.attr("title")
                self.get_point_info("http://www.mafengwo.cn" + href, title)
                time.sleep(self._DELAY)

    def get_point_info(self, url, title):
        """Download one scenic-spot page, parse it, store and collect it.

        The original author noted that the hard-coded cookie is problematic.

        Args:
            url: absolute URL of the spot's detail page.
            title: spot name taken from the listing.

        Raises:
            RuntimeError: if the server answers with HTTP 521 (presumably the
                site's anti-bot challenge, i.e. the cookie is no longer
                accepted -- TODO confirm).
        """
        poi_dict = {'景区名称': title}
        # Per-request copy so the shared header template is not mutated.
        headers = dict(self.pio_headers, Referer=url)
        print(url)
        poi_resp = requests.get(url, headers=headers, timeout=self._TIMEOUT)
        if poi_resp.status_code == 521:
            raise RuntimeError('HTTP 521 received for %s' % url)

        poi_doc = pq(poi_resp.content)
        poi_dict['景区介绍'] = poi_doc('.summary').text()
        place_station = poi_doc('.mod.mod-location p').text()
        for dl in poi_doc('.mod.mod-detail dl').items():
            dt = dl('dt').text()
            if '门票' in dt:
                poi_dict['门票'] = dl('dd').text()
            elif '开放时间' in dt:
                poi_dict['开放时间'] = dl('dd').text()
        poi_dict['景点位置'] = place_station

        # insert_one() adds an ``_id`` ObjectId to the mapping it is given;
        # store a copy so that field does not leak into the Excel export.
        self.client.insert_one(dict(poi_dict))
        print(poi_dict)
        self.all_list.append(poi_dict)

    def run(self):
        """Crawl the default pages and write all collected spots to Excel."""
        self.get_page()
        pandas.DataFrame(self.all_list).to_excel('旅游景点.xlsx', index=False)


if __name__ == '__main__':
    ScenicSpot().run()
原文:https://www.cnblogs.com/lqn404/p/14231287.html