首页 > 编程语言 > 详细

使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中

时间:2019-03-11 13:09:13      阅读:291      评论:0      收藏:0      [点我收藏+]
import csv
import logging
import os
import random
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from threadpool import ThreadPool, makeRequests


def _pad_city_code(city_code):
    """Return the city code as a string, prefixed with "0" when its value is below 1000."""
    if city_code and int(city_code) < 1000:
        return '0' + str(city_code)
    return str(city_code)


def _build_list_payload(city_code, city_name, page_index):
    """Build the form fields sent with one hotel-list page request."""
    # NOTE(review): "code", "cardNo" and the two tokens look like values captured
    # from a browser session and the stay dates are fixed to March 2019 --
    # confirm they are still accepted by the site.
    return {
        "code": "7140144",
        "listRequest.areaID": "",
        "listRequest.bookingChannel": "1",
        "listRequest.cardNo": "192928",
        "listRequest.checkInDate": "2019-03-02 00:00:00",  # check-in date
        "listRequest.checkOutDate": "2019-03-03 00:00:00",  # check-out date
        "listRequest.cityID": city_code,
        "listRequest.cityName": city_name,  # e.g. 北京
        "listRequest.customLevel": "11",
        "listRequest.distance": "20",
        "listRequest.endLat": "0",
        "listRequest.endLng": "0",
        "listRequest.facilityIds": "",
        "listRequest.highPrice": "0",
        "listRequest.hotelBrandIDs": "",
        "listRequest.isAdvanceSave": "false",
        "listRequest.isAfterCouponPrice": "true",
        "listRequest.isCoupon": "false",
        "listRequest.isDebug": "false",
        "listRequest.isLimitTime": "false",
        "listRequest.isLogin": "false",
        "listRequest.isMobileOnly": "true",
        "listRequest.isNeed5Discount": "true",
        "listRequest.isNeedNotContractedHotel": "false",
        "listRequest.isNeedSimilarPrice": "false",
        "listRequest.isReturnNoRoomHotel": "true",
        "listRequest.isStaySave": "false",
        "listRequest.isTrace": "false",
        "listRequest.isUnionSite": "false",
        "listRequest.keywords": "",
        "listRequest.keywordsType": "0",
        "listRequest.language": "cn",
        "listRequest.listType": "0",
        "listRequest.lowPrice": "0",
        "listRequest.orderFromID": "50",
        "listRequest.pageIndex": page_index,  # pagination
        "listRequest.pageSize": "20",
        "listRequest.payMethod": "0",
        "listRequest.personOfRoom": "0",
        "listRequest.poiId": "0",
        "listRequest.promotionChannelCode": "0000",
        "listRequest.proxyID": "ZD",
        "listRequest.rankType": "0",
        "listRequest.returnFilterItem": "true",
        "listRequest.sellChannel": "1",
        "listRequest.seoHotelStar": "0",
        "listRequest.sortDirection": "1",
        "listRequest.sortMethod": "1",
        "listRequest.starLevels": "",
        "listRequest.startLat": "0",
        "listRequest.startLng": "0",
        "listRequest.taRecommend": "false",
        "listRequest.themeIds": "",
        "listRequest.ctripToken": "1c06a555-04ce-4884-aa05-e6f92ad0e84e",
        "listRequest.elongToken": "jc94shhj-d5a1-4092-8060-828b168dbb61",
    }


def _build_list_headers(city_letter, user_agent):
    """Build the HTTP headers sent with a hotel-list page request."""
    # No Content-Length entry: requests computes it from the encoded body, so a
    # hard-coded value would only be misleading. The Cookie header is left out
    # as well (it was already commented out in the original).
    return {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'hotel.elong.com',
        'Origin': 'http://hotel.elong.com',
        'Pragma': 'no-cache',
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://hotel.elong.com/%s/' % city_letter,
        'User-Agent': user_agent,
        'X-Requested-With': 'XMLHttpRequest',
    }


def request_url(city_code, city_name, city_letter):
    """
    Crawl the hotel list pages of one city and save the rows to a CSV file.

    The city code is first appended to ``has_elong.json``. Then pages 1-88 of
    ``http://hotel.elong.com/<city_letter>/`` are requested one by one; every
    page is parsed with ``get_info_and_req_details`` and the resulting rows are
    written to ``艺龙/<city_name>.csv``. A page that fails is logged and skipped.

    :param city_code: city id (string or number); values below 1000 are sent
        with a leading "0"
    :param city_name: city name; used in the request and as the CSV file name
    :param city_letter: URL path segment of the city on hotel.elong.com
    """
    log = logging.getLogger(__name__)

    # Checkpoint: the __main__ block reads this file to skip listed cities.
    # NOTE(review): the city is recorded before it is crawled, so an interrupted
    # run still leaves it marked -- confirm that this is intended.
    with open('has_elong.json', 'a', encoding='utf-8') as hs:
        hs.write(str(city_code) + '\n')

    city_code = _pad_city_code(city_code)
    url = 'http://hotel.elong.com/%s/' % city_letter
    agents = UserAgent(verify_ssl=False)

    # The output directory may not exist yet; exist_ok keeps this safe when
    # several worker threads get here at the same time.
    os.makedirs('艺龙', exist_ok=True)
    # newline='' is what the csv module expects; without it Excel shows an
    # empty row after every record on Windows.
    with open('艺龙/%s.csv' % city_name, 'w', encoding='utf-8-sig', newline='') as f:
        cs = csv.writer(f, dialect='excel')
        # [hotel name, price, address, star rating, themes, services, hotel info]
        cs.writerow(['酒店名称', '价格', '地址', '星级', '主题', '可供服务', '酒店信息'])
        # Pages 1 through 88 (the upper bound of range() is exclusive).
        for n in range(1, 89):
            data = _build_list_payload(city_code, city_name, n)
            headers = _build_list_headers(city_letter, agents.random)
            try:
                # Random pause between pages to keep the request rate low.
                time.sleep(random.randint(1, 4))
                # NOTE(review): the headers describe an XHR form submission but
                # the request is a GET carrying a body -- confirm which method
                # the endpoint expects.
                res = requests.get(url, data=data, headers=headers, timeout=30)
                for row in get_info_and_req_details(res.text):
                    cs.writerow(row)
            except Exception:
                # Best effort: one bad page must not abort the whole city.
                log.warning('page %s of %s failed', n, city_name, exc_info=True)
                continue


def get_info_and_req_details(html):
    """
    Parse one hotel-list page and fetch the detail page of every hotel on it.

    Each ``div.h_item`` element is turned into a row
    ``[name, price, address, star rating, themes, services, hotel info]``.
    Hotels that lack one of the required elements (picture box with image,
    price, location link, address span) are skipped. At most 25 rows are
    collected per page.

    :param html: HTML text of a list page
    :return: list of rows, each a list of seven values
    """
    log = logging.getLogger(__name__)
    soup = BeautifulSoup(html, "lxml")
    page_list = []
    for hotel in soup.find_all('div', attrs={'class': 'h_item'}):
        if len(page_list) >= 25:
            break

        # Required fields: find() returns None for a missing element, so the
        # chained call raises AttributeError and the hotel is skipped.
        try:
            pic_box = hotel.find('div', attrs={'class': 'h_info_pic'})
            hotel_name = pic_box.find('img').get('alt')
            hotel_price = str(hotel.find('span', attrs={'class': 'h_pri_num'}).get_text()) + '元起'
            hotel_add = hotel.find('p', attrs={'class': 'h_info_b2'}).find('a').get_text()
            hotel_add = hotel_add.replace('[', '').replace(']', '')
            hotel_ress = hotel.find('span', attrs={'class': 'l1'}).get('data-hoteladdress')
        except AttributeError:
            continue

        # Optional fields fall back to a default when the element is missing.
        stars = hotel.find('b', attrs={'class': 'icon_stars'})
        hotel_grade = stars.get('title') if stars is not None else '经济型'

        tags = hotel.find('div', attrs={'class': 'tagList'})
        hotel_theme = tags.get_text().replace('\n', ',') if tags is not None else ''

        server = ''
        hotel_info = ''
        link_tag = pic_box.find('a')
        hotel_link = link_tag.get('href') if link_tag is not None else None
        if hotel_link:
            try:
                # Random pause before each detail request.
                time.sleep(random.randint(1, 3))
                detail_html = requests.get(
                    'http://hotel.elong.com%s#hotelContent' % hotel_link, timeout=30)
                server, hotel_info = get_details(detail_html.text)
            except Exception:
                # Best effort: keep the list-page data, leave detail fields empty.
                log.warning('detail page %s failed', hotel_link, exc_info=True)

        # A missing data-hoteladdress attribute contributes nothing to the
        # address instead of the literal text "None".
        hotel_address = str(hotel_add) + str(hotel_ress or '')
        page_list.append([hotel_name, hotel_price, hotel_address, hotel_grade,
                          hotel_theme, server, hotel_info])
    return page_list


def get_details(detail_html):
    """
    Extract the service list and the description text from a hotel detail page.

    The two fields are read independently, so a page that lacks one of the
    elements still yields the other.

    :param detail_html: HTML text of a hotel detail page
    :return: tuple ``(server, hotel_info)``; each item is '' when its element
        (``ul.dview_icon_list`` / ``div.dview_info``) is not found
    """
    detail = BeautifulSoup(detail_html, 'lxml')

    server = ''
    icon_list = detail.find('ul', attrs={'class': 'dview_icon_list'})
    if icon_list is not None:
        server = icon_list.get_text().replace('\n', ',')

    hotel_info = ''
    info_box = detail.find('div', attrs={'class': 'dview_info'})
    if info_box is not None:
        hotel_info = info_box.get_text().replace('\n', ',').replace('\t', ',')

    return server, hotel_info


if __name__ == '__main__':
    # Original author's note: for how to crawl the city list, refer to the
    # companion article on crawling Ctrip hotel information.

    # City codes already recorded in has_elong.json. The file is read once into
    # a set; it does not exist before the first run.
    has_num = set()
    try:
        with open('has_elong.json', encoding='utf-8') as done_file:
            for has in done_file:
                has = has.strip()
                if has:
                    has_num.add(int(has))
    except FileNotFoundError:
        pass

    # Every line of elong.json holds comma-separated fields that are passed to
    # request_url as positional arguments; the first field is the city code.
    req_list = []
    with open('elong.json', encoding='utf-8') as city_file:
        for line in city_file:
            line_list = line.replace("\n", "").split(',')
            if not line_list[0].strip():
                continue  # blank line
            if int(line_list[0]) in has_num:
                continue
            req_list.append((line_list, None))

    # Crawl the remaining cities with three worker threads.
    pool = ThreadPool(3)
    for req in makeRequests(request_url, req_list):
        pool.putRequest(req)
    pool.wait()

 

使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中

原文:https://www.cnblogs.com/wuyan717/p/10509740.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!