from selenium import webdriver
from scrapy.selector import Selector
import time
import random
import pymysql
import re
import os


class spider(object):
    # Helper that pulls the next-page link out of a shop search page.
    # (Kept from the original post; the main loop below inlines the same
    # kind of XPath instead of calling this.)
    def chul3(self, dates):
        a = Selector(text=dates)
        next_url = a.xpath('//*[@id="J_ShopSearchResult"]/div/div[2]/div[10]/a[11]/@href').extract_first("")
        return 'https:' + next_url


chuli = spider()

conection = pymysql.connect(host='localhost', user='root', password='123',
                            db='7.24测试', charset='utf8mb4',
                            cursorclass=pymysql.cursors.DictCursor)
with conection.cursor() as cursor:
    # All item ids already in the database, so known items are not re-inserted.
    cursor.execute("select * from 商品id")
    shop_oldid = [i['id'] for i in cursor.fetchall()]
    # Items that went on sale exactly 1, 7 or 30 days ago; `日期` names the
    # sales column that is due for an update today.
    sql1 = '''
        SELECT `商品id`.id, `上架时间`, '1天销量' as 日期 FROM `商品id`
        WHERE TIMESTAMPDIFF(DAY, `上架时间`, CURDATE()) = 1
        UNION
        SELECT `商品id`.id, `上架时间`, '7天销量' as 日期 FROM `商品id`
        WHERE TIMESTAMPDIFF(DAY, `上架时间`, CURDATE()) = 7
        UNION
        SELECT `商品id`.id, `上架时间`, '30天销量' as 日期 FROM `商品id`
        WHERE TIMESTAMPDIFF(DAY, `上架时间`, CURDATE()) = 30'''
    cursor.execute(sql1)
    shop_olxx = cursor.fetchall()
conection.commit()
cursor = conection.cursor()

driver = webdriver.Ie()   # renamed from `webdriver` so the module is not shadowed
# Open the Taobao login page and leave 20 seconds to log in by hand.
url = ('https://login.taobao.com/member/login.jhtml?spm=a21bo.50862.754894437.1.5dcec6f76Oq9Wh'
       '&f=top&redirectURL=https%3A%2F%2Fwww.taobao.com%2F%3Fspm%3Da1z10.1-c-s.1581860521.1.559a715a3EnsHq')
driver.get(url)
time.sleep(20)

path = os.getcwd()


def capture(webder, save_fn="capture.png"):
    # Scroll down in 100px steps so lazy-loaded images render, flag
    # completion by appending to the title, then take a full screenshot.
    webder.execute_script("""
        (function () {
            var y = 0;
            var step = 100;
            window.scroll(0, 0);
            function f() {
                if (y < document.body.scrollHeight) {
                    y += step;
                    window.scroll(0, y);
                    setTimeout(f, 50);
                } else {
                    window.scroll(0, 0);
                    document.title += "scroll-done";
                }
            }
            setTimeout(f, 1000);
        })();
    """)
    for i in range(30):
        if "scroll-done" in webder.title:
            break
        time.sleep(1)
    webder.save_screenshot(save_fn)


def lll(url):
    # Relies on the global `shop` set by the CSV loop at the bottom.
    driver.implicitly_wait(50)
    driver.get(url)
    driver.find_element_by_class_name('pagination')  # wait until the pager has loaded
    a = driver.page_source
    time.sleep(random.randrange(2, 6))
    selects = Selector(text=a)
    for i in selects.xpath('//*[@id="J_ShopSearchResult"]/div/div[2]/div/dl'):
        # re_first instead of re(): re() returns a list, which pymysql cannot store.
        bd_pig = i.xpath("./dt/a/img/@src").re_first('(.*)_') or ''
        bd_name = ''.join(re.findall(r'[\u4e00-\u9fa5]', i.xpath('./dd[1]/a/text()').extract_first('')))
        bd_id = ''.join(re.findall(r'\d', i.xpath('./dd[1]/a/@href').extract_first('')))
        bd_much = i.xpath('./dd[1]/div/div[1]/span[2]/text()').extract_first('')
        bd_idlian = 'http://item.taobao.com/item.htm?id=' + bd_id
        bd_liang = i.xpath('./dd[1]/div/div[last()]/span[last()]/text()').extract_first('')
        if bd_id not in shop_oldid:
            sql = ("INSERT INTO 商品id (`品牌`, `id`, 图片链接, 价格, 标题, 商品地址) "
                   "VALUES (%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, (shop.split(",")[0], bd_id, bd_pig, bd_much, bd_name, bd_idlian))
            conection.commit()
            shop_oldid.append(bd_id)  # so later pages do not insert the same id twice
            # Visit the item page for category, sizes, colours and a screenshot.
            driver.implicitly_wait(50)
            driver.get(bd_idlian)
            driver.find_element_by_class_name('tb-price-spec')  # wait for the price block
            time.sleep(random.randrange(2, 6))
            select_xixi = Selector(text=driver.page_source)
            liem = select_xixi.xpath('//*[@id="J_TMySize"]/@data-value').extract_first("")
            cursor.execute('update `商品id` set `商品id`.`类目` = %s where id = %s', (liem, bd_id))
            conection.commit()
            c = 1
            ee = 1
            for dl in select_xixi.xpath('//*[@id="J_isku"]/div/dl'):
                b = dl.xpath('./dt/text()').extract_first("")
                if '尺码' in b:
                    aa = dl.xpath('./dd/ul/li/a/span/text()').extract()
                    ee = len(aa)
                    cursor.execute('update `商品id` set `商品id`.`尺码` = %s where id = %s',
                                   (' '.join(aa), bd_id))
                    conection.commit()
                if '颜色' in b:
                    aa = dl.xpath('./dd/ul/li/a/span/text()').extract()
                    c = len(aa)
                    cursor.execute('update `商品id` set `商品id`.`颜色` = %s where id = %s',
                                   (' '.join(aa), bd_id))
                    conection.commit()
            # SKU count = colours x sizes.
            cursor.execute('update `商品id` set `商品id`.`sku量` = %s where id = %s', (c * ee, bd_id))
            conection.commit()
            # Screenshot saved under <cwd>\<brand>\<id><cleaned page title>.jpg.
            # The original reused a stale loop variable here; bd_id is what was meant.
            title = os.path.join(path, shop.split(",")[0],
                                 bd_id + re.sub(r"\W", "", driver.title))
            capture(driver, title + '.jpg')
        # Write today's sales figure into whichever 1/7/30-day column is due.
        for row in shop_olxx:
            if row['id'] == bd_id:
                sql = "UPDATE 商品id set " + row['日期'] + " = (%s) where id = %s"
                cursor.execute(sql, (bd_liang, row['id']))
                conection.commit()
    # Recurse into the next results page while one exists.
    next_url = selects.xpath('//*[@id="J_ShopSearchResult"]/div/div[2]/div[last()]/a[last()]/@href').extract_first("")
    if next_url:
        lll('https:' + next_url)


with open(os.path.join(os.getcwd(), '1.csv'), 'r') as c:
    for shop in c.readlines():
        url = shop.split(",")[2]
        lll(url)
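The post never shows the layout of the `商品id` table, but every column the script touches can be read off the SQL statements above. Below is a minimal sketch of a matching schema, assuming simple VARCHAR/TEXT types (the types are not from the post). `上架时间` is modelled as a TIMESTAMP that defaults to the insert time, since the INSERT never sets it while the 1/7/30-day queries read it:

# Hypothetical DDL for the `商品id` table -- NOT from the original post.
# Columns are inferred from the SELECT/INSERT/UPDATE statements in the
# script; every type here is an assumption.
import pymysql

ddl = '''
CREATE TABLE IF NOT EXISTS `商品id` (
    `id`       VARCHAR(32) PRIMARY KEY,
    `品牌`     VARCHAR(64),
    `图片链接` VARCHAR(255),
    `价格`     VARCHAR(32),
    `标题`     VARCHAR(255),
    `商品地址` VARCHAR(255),
    `类目`     VARCHAR(64),
    `尺码`     TEXT,
    `颜色`     TEXT,
    `sku量`    INT,
    `上架时间` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    `1天销量`  VARCHAR(32),
    `7天销量`  VARCHAR(32),
    `30天销量` VARCHAR(32)
) CHARACTER SET utf8mb4'''

conn = pymysql.connect(host='localhost', user='root', password='123',
                       db='7.24测试', charset='utf8mb4')
with conn.cursor() as cur:
    cur.execute(ddl)
conn.commit()
conn.close()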
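The driver file `1.csv` is not shown either. From the way `shop.split(",")` is indexed, field 0 is stored as `品牌` and doubles as the screenshot subfolder, field 2 is the shop search-results URL handed to `lll()`, and field 1 is never read. A made-up example line (shop name and URL are placeholders; the URL must not contain a comma, since the line is split on commas):

某品牌,unused,https://example-shop.taobao.com/search.htm?search=y

Note that nothing in the script creates the `<cwd>\<brand>` folder, so it has to exist before the first screenshot is written; each CSV line then drives one full crawl of that shop.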
Original post: http://www.cnblogs.com/gao-xiang/p/7228194.html