import requests
from lxml import etree


def city_page(base_url):
    """Fetch the national postal-code index page and crawl every city link.

    Args:
        base_url: Site root, e.g. 'https://www.ip138.com/'.

    Side effects:
        Calls post_code() for each city, which appends rows to per-city
        CSV files in the current directory.
    """
    url = base_url + 'post/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    response = requests.request('get', url=url, headers=headers)
    page_data = etree.HTML(response.text)
    rows = page_data.xpath("//table[@id='quanguo']//tr")
    for row in rows:
        for cell in row.xpath(".//td"):
            href_num = cell.xpath("./a/@href")
            href_name = cell.xpath("./a/text()")
            # Trailing cells can be empty, so require BOTH the link and its
            # text before proceeding (fixes a potential NameError where
            # 'dirname' was used even when href_name was empty).
            if href_num and href_name:
                href = href_num[0]
                # The server sends GBK bytes mislabelled as ISO-8859-1;
                # round-trip the encoding to recover the city name.
                dirname = href_name[0].encode('ISO-8859-1').decode('gbk')
                href_url = base_url + href
                post_code(href_url, dirname)


def post_code(base_url, dirname):
    """Scrape one city's postal-code table and append its rows to a CSV.

    Args:
        base_url: Full URL of the city page, e.g. 'https://www.ip138.com/10/'.
        dirname: City name used to build the output filename.

    Side effects:
        Appends one comma-separated line per table row to
        './邮编<dirname>.csv' (UTF-8).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    response = requests.request('get', url=base_url, headers=headers)
    # The page is served as gb2312; set the decode format explicitly so
    # response.text yields readable characters.
    response.encoding = 'gb2312'
    page_data = etree.HTML(response.text)
    rows = page_data.xpath("//table/tr[@bgcolor='#ffffff']")
    for row in rows:
        record = []
        for cell in row.xpath('./td'):
            text = cell.xpath('string()')
            # Skip placeholder cells that contain only a non-breaking space.
            if text == '\xa0':
                continue
            record.append(text)
        # Bug fix: the original wrote str(record) with no separator, which
        # produced concatenated Python list reprs rather than CSV. Write one
        # comma-joined row per line instead.
        with open('./邮编' + dirname + '.csv', 'a+', encoding='utf-8') as f:
            f.write(','.join(record) + '\n')


if __name__ == '__main__':
    # https://www.ip138.com/post/  -- national index page
    # https://www.ip138.com/10/    -- example: Beijing postal-code page
    base_url = 'https://www.ip138.com/'
    city_page(base_url)
# Source: https://www.cnblogs.com/lizhihoublog/p/12550232.html