首页 > 其他 > 详细

全国邮编爬取

时间:2020-03-23 10:22:45      阅读:37      评论:0      收藏:0      [点我收藏+]
import requests
from lxml import etree

def city_page(base_url):
    """Crawl the postcode index page and scrape every region it links to.

    Fetches ``base_url + "post/"``, walks the cells of the table whose id is
    ``quanguo`` and, for every cell that contains a link, calls
    ``post_code`` with the link's absolute URL and the link text.

    Args:
        base_url: Site root ending in a slash, e.g. ``"https://www.ip138.com/"``.
    """
    from urllib.parse import urljoin

    url = base_url + "post/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
    }
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.request("get", url=url, headers=headers, timeout=10)
    # Decode the body as GBK up front; this replaces re-encoding each link
    # text from ISO-8859-1, which breaks whenever requests picks a different
    # charset for the response.
    response.encoding = "gbk"
    page_data = etree.HTML(response.text)
    if page_data is None:
        # lxml may hand back None for an empty document; nothing to crawl.
        return

    for row in page_data.xpath("//table[@id='quanguo']//tr"):
        for cell in row.xpath(".//td"):
            links = cell.xpath("./a/@href")
            names = cell.xpath("./a/text()")
            # Some cells (e.g. the trailing one) hold no link. Skip them
            # instead of reusing the href/name left over from the previous
            # cell, which would re-scrape that region or raise NameError.
            if not links or not names:
                continue
            # urljoin copes with "10/", "/10/" and absolute hrefs alike.
            post_code(urljoin(base_url, links[0]), names[0])


def post_code(base_url, dirname):
    """Scrape one region's postcode table and append it to a CSV file.

    Downloads ``base_url``, decodes it as GB2312, takes every
    ``//table/tr`` whose ``bgcolor`` is ``#ffffff`` and appends the text of
    its cells as one CSV record to ``./邮编<dirname>.csv`` (UTF-8).

    Args:
        base_url: Full URL of the region page, e.g.
            ``"https://www.ip138.com/10/"``.
        dirname: Region name; used as the suffix of the output file name.
    """
    import csv

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
    }
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.request("get", url=base_url, headers=headers, timeout=10)
    # Force the decoding used for response.text.
    response.encoding = "gb2312"
    page_data = etree.HTML(response.text)
    if page_data is None:
        # lxml may hand back None for an empty document; nothing to save.
        return

    records = []
    for row in page_data.xpath("//table/tr[@bgcolor='#ffffff']"):
        texts = (cell.xpath("string()") for cell in row.xpath("./td"))
        # Cells holding only a non-breaking space are padding, not data.
        record = [text for text in texts if text != "\xa0"]
        if record:
            records.append(record)

    if not records:
        return

    # Open the file once and emit real CSV rows; the previous version wrote
    # the repr of a Python list with no line terminator between rows.
    # newline="" is what the csv module expects of the file object.
    with open("./邮编" + dirname + ".csv", "a", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(records)



if __name__ == "__main__":
    # Index page listing every region: https://www.ip138.com/post/
    # Example region page (Beijing):   https://www.ip138.com/10/
    base_url = "https://www.ip138.com/"
    city_page(base_url)

 

全国邮编爬取

原文:https://www.cnblogs.com/lizhihoublog/p/12550232.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!