首页 > 其他 > 详细

收录及出图导出excel表

时间:2019-07-14 17:31:55      阅读:94      评论:0      收藏:0      [点我收藏+]
# coding=utf-8
# @author: Mana_菜小刀
import requests
import queue
import threading
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress urllib3's InsecureRequestWarning output.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Browser-like User-Agent header sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}

# Create result.xls containing only the header row. baiduSpider.get_url later
# re-opens this file and appends one row per checked URL.
# NOTE(review): the quotes around these string literals were lost in the
# published listing and have been restored here.
myxls = xlwt.Workbook()
sheet1 = myxls.add_sheet('收录search')
lst_name = ['url', '收录/未收录', '图片']
for i, title in enumerate(lst_name):
    sheet1.write(0, i, title)
myxls.save('result.xls')

def log(*args, **kwargs):
    """Forward every positional and keyword argument unchanged to ``print``."""
    return print(*args, **kwargs)


class baiduSpider(threading.Thread):
    """Worker thread that checks queued Baidu search URLs.

    Each worker takes search URLs from a shared queue, downloads the result
    page and appends one row (query, indexed / not indexed, image / no image)
    to ``result.xls``.
    """

    # Serialises the open-copy-append-save cycle on result.xls. Without it,
    # concurrent workers read the same row count and overwrite each other.
    _xls_lock = threading.Lock()

    # Seconds to wait on a request before giving up on it.
    _request_timeout = 10

    # Number of connection retries per request.
    _max_retries = 5

    def __init__(self, queue_li, name):
        """Remember the shared work queue and use ``name`` as the thread name.

        Args:
            queue_li: ``queue.Queue`` of Baidu search URLs to process.
            name: Name given to this thread.
        """
        threading.Thread.__init__(self)
        self._queue = queue_li
        # Go through the public property instead of overwriting Thread's
        # private ``_name`` attribute directly.
        self.name = name

    def run(self):
        """Process URLs from the queue until it is empty."""
        while True:
            try:
                # get_nowait() instead of empty() + get(): another worker may
                # take the last item between the two calls, and a blocking
                # get() would then hang this thread forever.
                url = self._queue.get_nowait()
            except queue.Empty:
                break
            try:
                self.get_url(url)
            except Exception as e:
                # Best effort: a failure on one URL must not stop the worker.
                log(e)

    def get_url(self, url):
        """Fetch one Baidu result page and append its verdict to result.xls.

        Args:
            url: Full search URL (``http://www.baidu.com/s?wd=<query>``).

        Raises:
            IndexError: The page has no result-count element, so the indexed
                state cannot be determined.
            requests.RequestException: The request failed or timed out.
        """
        search_mo = ['收录', '未收录']
        img_mo = ['有图', '无图']
        url_mo = url.replace('http://www.baidu.com/s?wd=', '')

        # Closing the session releases its connections. Retries have to be
        # configured on a mounted adapter: assigning
        # requests.adapters.DEFAULT_RETRIES after import has no effect.
        with requests.Session() as session:
            adapter = requests.adapters.HTTPAdapter(max_retries=self._max_retries)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            response = session.get(
                url=url, headers=headers, timeout=self._request_timeout
            )
        xpather = etree.HTML(response.text)

        strs = xpather.xpath('//span[@class="nums_text"]//text()')
        imgs = xpather.xpath('//img[@class="c-img c-img6"]/@src')
        if not strs:
            raise IndexError(
                'no result-count element in response for {}'.format(url_mo)
            )

        included = strs[0] != "百度为您找到相关结果约0个"
        status = search_mo[0] if included else search_mo[1]
        # An image only counts when the query is indexed at all.
        image = img_mo[0] if included and imgs else img_mo[1]

        self._append_row((url_mo, status, image))
        # NOTE(review): the original passed extra separator strings between
        # these values, but their text was lost in the published listing;
        # print's default single-space separator is used instead.
        log(status, image, url_mo)

    def _append_row(self, values):
        """Append ``values`` as the next row of the first sheet of result.xls."""
        with self._xls_lock:
            workbook = xlrd.open_workbook('result.xls', formatting_info=True)
            next_row = workbook.sheet_by_index(0).nrows
            newbook = copy(workbook)
            newsheet = newbook.get_sheet(0)
            for col, value in enumerate(values):
                newsheet.write(next_row, col, value)
            newbook.save('result.xls')

def main(url_file='urls', thread_count=10):
    """Queue one Baidu search per line of ``url_file`` and run the workers.

    Args:
        url_file: Path of a UTF-8 text file with one URL/query per line.
            Change the default to the name of your own txt file.
        thread_count: Number of ``baiduSpider`` worker threads to start.

    Returns:
        None. Returns immediately, without starting any thread, when the
        file contains no non-blank line.
    """
    queue_li = queue.Queue()

    # utf-8-sig drops a leading BOM if present; strip() removes the '\r' of
    # Windows line endings and stray spaces that would corrupt the query.
    with open(url_file, 'r', encoding='utf-8-sig', errors="ignore") as f:
        for line in f:
            url_search = line.strip()
            if url_search:
                queue_li.put('http://www.baidu.com/s?wd={}'.format(url_search))

    if queue_li.empty():
        # Nothing to check, so no workers are needed.
        return

    # Name workers by index; the original reused the last URL as the name of
    # every thread and failed with NameError when there was no URL at all.
    threads = [
        baiduSpider(queue_li, 'spider-{}'.format(i))
        for i in range(thread_count)
    ]

    for spider in threads:
        spider.start()

    for spider in threads:
        spider.join()

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    log("Mana好伟大!(^-^)V")
    main()

 

收录及出图导出excel表

原文:https://www.cnblogs.com/mana66ccff/p/11184899.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!