python 未知

时间：2017-12-31 11:35:10 阅读：254 评论：0 收藏：0 [点我收藏+]

import time
import requests
from bs4 import BeautifulSoup
import threading


def format_str(s):
    """Return *s* with every newline and tab character removed.

    Used to flatten scraped link text and href values so each one fits on a
    single tab-separated line.

    Args:
        s: the string to clean.

    Returns:
        A copy of ``s`` without ``"\\n"`` or ``"\\t"`` characters; all other
        characters (including spaces) are left untouched.
    """
    # The original chain also contained ``.replace("", "")``, which replaces
    # the empty string with itself and therefore never changed the result;
    # it is omitted here.
    return s.replace("\n", "").replace("\t", "")



def get_urls_in_pages(from_page_num, to_page_num):
    """Scrape book links from a range of search-result pages into a file.

    Requests every search-result page numbered ``from_page_num`` through
    ``to_page_num`` (inclusive), collects each ``<a>`` element whose ``href``
    contains ``bookid`` but not ``shopcar0.jsp`` and whose text is non-empty,
    and writes the cleaned ``title<TAB>href`` pairs, one per line, to
    ``<from_page_num>_<to_page_num>_all_hrefs.txt`` in the current working
    directory. Finally prints the page range and the number of links written.

    Args:
        from_page_num: first page number to request.
        to_page_num: last page number to request (inclusive).

    Returns:
        None. The results are only written to the output file.

    Raises:
        requests.RequestException: propagated from ``requests.get`` when a
            page cannot be fetched (connection failure or timeout).
        OSError: if the output file cannot be opened or written.
    """
    search_word = '计算机'
    # NOTE(review): the URL pieces are kept exactly as in the original. The
    # space in "? Page=" and the hard-coded "&Page=2" look suspicious (the
    # loop's page number may not be the one the server honours) -- confirm
    # against the site before changing them.
    url_part_1 = ('http://www.phei.com.cn/moudle/goods/'
                  'searchkey.jsp? Page=')
    url_part_2 = '&Page=2&searchKey='
    urls = [url_part_1 + str(i) + url_part_2 + search_word
            for i in range(from_page_num, to_page_num + 1)]

    all_href_list = []
    for url in urls:
        print(url)
        # A timeout keeps one stalled connection from blocking forever.
        resp = requests.get(url, timeout=30)
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        bs = BeautifulSoup(resp.text, 'html.parser')

        needed_list = []
        seen = set()  # cleaned (title, href) pairs already kept for this page
        for a in bs.find_all('a', href=True):
            href_val = a['href']
            title = a.text
            if ('bookid' not in href_val
                    or 'shopcar0.jsp' in href_val
                    or title == ''):
                continue
            # Deduplicate on the cleaned values: the original compared the
            # raw pair against already-cleaned entries, so duplicates whose
            # text contained newlines or tabs were never detected.
            entry = (format_str(title), format_str(href_val))
            if entry not in seen:
                seen.add(entry)
                needed_list.append(list(entry))
        all_href_list += needed_list

    out_name = '{}_{}_all_hrefs.txt'.format(from_page_num, to_page_num)
    # The context manager closes the file even if a write fails.
    with open(out_name, 'w') as all_href_file:
        all_href_file.writelines('\t'.join(href) + '\n'
                                 for href in all_href_list)
    print(from_page_num, to_page_num, len(all_href_list))

python 未知

原文：https://www.cnblogs.com/Justice-V/p/8157180.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)