完成的目标:
输入搜索的商品 以及 淘宝的已评价数目、店铺的商品描述(包括如实描述、服务态度、快递的5.0打分);
按要求,筛选出指定数量的结果,并按“物美价廉算法”排序后输出
思路:
1,利用淘宝搜索 'https://s.taobao.com/search?' 的价格 filter 先进行价格筛选,得到结果页面的网址
2,用urllib打开结果网站,构造正则表达式匹配出各个商品结果的 价格、已评价数量、店铺的如实描述等信息;
并把结果保存至二维数组里。
3,利用商品及店铺信息,用“物美价廉算法”给各个商品打分
4,按打分排序, 各个信息总结果按排序输出到新建txt文件里;
并将各个商品图片下载到文件夹,同时建立以相同序号开头的txt(其名字包含简要的商品信息),这样图片和商品信息能在同一个文件夹里用大图排列同时看到。
5,可以把参数(价格范围等要求)改为通过输入函数读取,再用 pyinstaller 把整个 py 程序打包为 EXE 就可以发布了。
源代码如下
# -*- coding: utf-8 -*-
"""Search Taobao for a product and rank the hits by a "good value" score.

Workflow:

1. Build a price-filtered search URL on https://s.taobao.com/search.
2. Fetch each result page and pull price, comment count and the shop's
   delivery / description / service scores out of the embedded JSON with a
   regular expression.
3. Score every collected product with ``recommend_rate``.
4. Sort by score, write a summary to ``found_goods.txt`` and download each
   product picture next to a small ``.txt`` file holding the product link.

The script was written for Python 2; it is kept runnable there and also
works on Python 3 (imports are guarded, every ``print`` call takes exactly
one argument so it means the same thing on both versions).
"""
import io
import os
import random
import re
import sys
import time
import urllib
from math import log
from math import log10
from math import sqrt

try:  # Python 2
    import urllib2
    from urllib import quote as _url_quote
    from urllib import urlretrieve as _url_retrieve
except ImportError:  # Python 3
    import urllib.request as urllib2
    from urllib.parse import quote as _url_quote
    from urllib.request import urlretrieve as _url_retrieve

if sys.version_info[0] == 2:
    # Python 2 only: implicit str <-> unicode conversions must use UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf8')

_TEXT_TYPE = type(u'')

# Lowest price used in the score, so a free (0.00) item cannot divide by zero.
_MIN_PRICE = 0.01

# Characters that are not allowed in Windows file names.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')

# One match yields 15 groups:
#   0 title, 1 picture url, 2 detail url, 3 price, 4 comment count,
#   5 shop name, 6-8 delivery, 9-11 description, 12-14 service.
# Each score triple is read by the code below as
# (score * 100, sign, percent-vs-average * 100).
_PRODUCT_PATTERN = re.compile(
    r'raw_title":"(.*?)","pic_url":"(.*?)","detail_url":"(.*?)",'
    r'"view_price":"(.*?)".*?"comment_count":"(.*?)".*?"nick":"(.*?)".*?'
    r'"delivery":\[(.*?),(.*?),(.*?)\],'
    r'"description":\[(.*?),(.*?),(.*?)\],'
    r'"service":\[(.*?),(.*?),(.*?)\]'
)

# Filter settings read by get_praised_good / get_all_praised_goods.  They are
# module globals because the original script set them in its __main__ section;
# the defaults below make the functions usable after a plain import as well.
keyword = ''                      # regex the product title must match
description_higher_require = 0    # min % above average for "description"
service_higher_require = 0        # min % above average for "service"
description_count_require = 6     # min number of comments


def _to_text(value):
    """Return *value* as text (``unicode`` on Python 2, ``str`` on 3)."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    if isinstance(value, _TEXT_TYPE):
        return value
    return _TEXT_TYPE(value)


def _say(*parts):
    """Print *parts* separated by spaces, identically on Python 2 and 3."""
    print(u' '.join(_to_text(part) for part in parts))


def _decode_page(raw):
    """Return a downloaded page as the native ``str`` type."""
    if isinstance(raw, str):
        return raw
    return raw.decode('utf-8', 'replace')


def _unescape(fragment):
    """Resolve ``\\uXXXX`` escapes in a URL fragment taken from the page."""
    if isinstance(fragment, bytes):
        return fragment.decode('unicode-escape').encode('utf-8')
    return fragment.encode('utf-8').decode('unicode-escape')


def _safe_filename(name):
    """Replace characters that cannot appear in a file name with ``_``."""
    return _INVALID_FILENAME_CHARS.sub('_', name)


def _quote_term(term):
    """Percent-encode a search term (UTF-8) for use in a query string."""
    if not isinstance(term, bytes):
        term = term.encode('utf-8')
    return _url_quote(term)


class counter(object):
    """Shared run state: counters, seen URLs and the collected products."""

    def __init__(self):
        self.count = 0          # products accepted so far
        self.try_time = 0       # products attempted
        self.fail_time = 0      # pages / rows / pictures that failed
        self.url_list = []      # detail URLs already accepted
        self.new_flag = True    # False once a page only repeats old products
        self.results = []       # one list per accepted product
        self.p = 0              # running sum of prices
        self.d = 0              # running sum of description scores

    def print_counter(self):
        """Print the attempt / success / failure counters."""
        _say('try_time:', self.try_time, " get_count:", self.count,
             " fail_time:", self.fail_time)


counter1 = counter()


def post_request(url):
    """Build a request for *url* with a randomly chosen User-Agent.

    The original also added a header literally named ``GET``; that is not an
    HTTP header (the method is part of the request line), so it is dropped.
    """
    User_Agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11O",
    ]
    req = urllib2.Request(url)
    req.add_header("User-Agent", random.choice(User_Agents))
    req.add_header("Referer", url)
    return req


def recommend_rate(price, description, delivery, service, comments):
    """Return the "good value" score of one product.

    The score is
    ``(description / avg_description) ** 20 * (description + delivery +
    service) * (avg_price / price) ** 0.1 + log(comments + 5, 1000)``
    where the averages come from the running sums in ``counter1``.

    Raises ZeroDivisionError when ``counter1`` holds no products yet (or the
    average description score is 0).
    """
    av_p = float(counter1.p) / counter1.count
    av_d = float(counter1.d) / counter1.count
    price = max(price, _MIN_PRICE)  # a 0.00 price must not divide by zero

    price_factor = (av_p / price) ** 0.1
    description_factor = (description / av_d) ** 20
    comments_factor = log(comments + 5, 1000)
    rate = (description_factor * (description + delivery + service)
            * price_factor + comments_factor)

    _say('all count=', counter1.count)
    _say("avrage price=", av_p, ';', av_p / price, ';price', price,
         ';comments=', comments, ';descrip=', description)
    # Report the factor that is actually used in the score.
    _say('rate=', rate, '(price)yinzi', price_factor,
         'descrip_yinzi', description_factor,
         'comments_factor=', comments_factor)
    return rate


def product_rank(list):
    """Append the score from ``recommend_rate`` to every product row.

    Row layout: 0 title, 1 picture url, 2 detail url, 3 price, 4 comments,
    5 shop, 6 delivery, 7 description, 8 service; the score lands at index 9.
    The parameter keeps its original name ``list`` for caller compatibility.
    """
    for row in list:
        row.append(recommend_rate(row[3], row[7], row[6], row[8], row[4]))


def get_user_rate(item_url):
    """Fetch the seller-rating page linked from a product page.

    Not used by the main flow: the original author notes that the rating
    page cannot be read without being logged in.  Returns the list of score
    strings found (empty when nothing could be extracted).
    """
    html = urllib2.urlopen(item_url)
    try:
        page = _decode_page(html.read())
    finally:
        html.close()

    # e.g. "//rate.taobao.com/user-rate-282f910f3b70f2128abd0ee9170e6428.htm"
    codes = re.findall(r'"(//.*?user\-rate.*?)"', page)
    if not codes:
        return []
    user_rate_url = 'http:' + codes[0]
    _say('uu', user_rate_url)

    user_rate_html = urllib2.urlopen(user_rate_url)
    try:
        # Read once: a second read() on the same response returns nothing.
        rate_page = _decode_page(user_rate_html.read())
    finally:
        user_rate_html.close()
    _say(rate_page)

    # e.g. title="4.78589分"
    descs = re.findall(u'title="(4.[0-9]{5}).*?', rate_page)
    _say(len(descs))
    return descs


# Sample product URL for trying get_user_rate by hand.
item_url = 'https://item.taobao.com/item.htm?id=530635294653&ns=1&abbucket=0#detail'


def get_praised_good(url, file_open, keyword, counts):
    """Collect the products on one result page that pass the filters.

    Accepted products are appended to ``counter1.results``.  A product is
    accepted when its description and service scores are at least
    ``description_higher_require`` / ``service_higher_require`` percent above
    average, it has at least ``description_count_require`` comments and its
    title matches the regular expression *keyword*.  Collection stops once
    ``counter1.count`` reaches *counts*, or when an already seen product
    shows up (``counter1.new_flag`` is then cleared).

    *file_open* is unused; it is kept so existing callers keep working.
    """
    html = urllib2.urlopen(post_request(url))
    try:
        code = _decode_page(html.read())
    finally:
        html.close()

    for x in _PRODUCT_PATTERN.findall(code):
        if counter1.count >= counts:
            _say("have get enough pruducts")
            break
        try:
            description_higher = int(x[10]) * float(x[11]) / 100
            service_higher = int(x[13]) * float(x[14]) / 100
            price = float(x[3])
            delivery = float(x[6]) / 100
            description = float(x[9]) / 100
            service = float(x[12]) / 100
        except ValueError:
            # One malformed row must not abort the whole page.
            counter1.fail_time += 1
            continue
        try:
            comment_count = int(x[4])
        except ValueError:
            comment_count = 0

        if description_higher < description_higher_require:
            continue
        if service_higher < service_higher_require:
            continue
        if comment_count < description_count_require:
            continue
        if not re.search(keyword, x[0]):
            continue

        detail_url = 'http:' + _unescape(x[2])
        pic_url = 'http:' + _unescape(x[1])
        if detail_url in counter1.url_list:
            counter1.new_flag = False
            _say('no more new met products')
            _say(counter1.url_list)
            _say(detail_url)
            break

        counter1.url_list.append(detail_url)
        counter1.try_time += 1
        counter1.count += 1
        counter1.p += price
        counter1.d += description
        counter1.results.append([
            x[0], pic_url, detail_url, price, comment_count,
            _to_text(x[5]), delivery, description, service,
        ])


def save_downpic(lis, file_open, reserve_file_path):
    """Download each product picture and record the product details.

    For every ranked row in *lis* (layout as in ``product_rank``, score at
    index 9) the picture is saved to *reserve_file_path*, a ``.txt`` file
    with the same leading number is created holding the product link, and a
    summary is appended to *file_open*.  A failing product is reported and
    counted in ``counter1.fail_time``; the remaining ones are still handled.
    """
    _say(len(lis))
    for cc, x in enumerate(lis):
        try:
            title = _to_text(x[0])
            shop = _to_text(x[5])
            pic_path = os.path.join(
                reserve_file_path,
                u'%s___' % cc + _safe_filename(title) + u'.jpg')
            _url_retrieve(x[1], pic_path)

            txt_name = os.path.join(
                reserve_file_path,
                u'%s__' % cc
                + u'custome_description_%s __' % x[7]
                + u'__comments_%s_' % x[4]
                + u'___price_%srmb___' % x[3]
                + _safe_filename(shop) + u'.txt')
            with io.open(txt_name, 'a', encoding='utf-8') as file_o:
                file_o.write(_to_text(x[2]))

            _say('\nget_one_possible_fine_goods:\n', 'good_name:', title)
            _say('rate=', x[9])
            _say('price:', x[3], shop)
            _say('custome_description:', x[7], '--',
                 'described_number:', x[4], ' service:', x[8])
            _say(x[2], '\ngood_pic_url:', x[1])
            _say(txt_name)
            _say(cc + 1, "th")

            file_open.write(
                u'%s__' % cc + title
                + u'\nprice:' + _to_text(x[3]) + u'¥,' + u'\n'
                + _to_text(x[2]) + u' \n' + shop
                + u'\ncustomer_description:' + _to_text(x[7])
                + u'described_number:' + _to_text(x[4]) + u'\n\n\n')
            _say('get one -^-')
        except (IOError, OSError, ValueError, IndexError) as err:
            counter1.fail_time += 1
            _say('lose one picture', cc, err)
        time.sleep(0.5)  # be gentle with the image server


def get_all_praised_goods(reserch_goods, counts, reserve_file_path,
                          price_min, price_max):
    """Walk the search result pages, then rank and save what was found.

    Pages are fetched (44 results per page, at most 11 pages) until *counts*
    products are collected or a page only repeats known products.  The
    collected products are scored, sorted best first and handed to
    ``save_downpic``; the summary goes to ``found_goods.txt`` inside
    *reserve_file_path*.  The title filter is the module global ``keyword``.
    """
    initial_url = ('https://s.taobao.com/search?q=' + _quote_term(reserch_goods)
                   + '&filter=reserve_price%5B' + '%s' % price_min
                   + '%2C' + '%s' % price_max + '%5D&s=')
    _say("initial_url", initial_url)
    page_n = 0
    url_1 = None
    reserve_file = os.path.join(reserve_file_path, 'found_goods.txt')

    file_open = io.open(reserve_file, 'a', encoding='utf-8')
    try:
        file_open.write(u'****************************\n')
        file_open.write(_to_text(time.ctime()))
        file_open.write(u'\n****************************\n')

        while counter1.new_flag and counter1.count < counts:
            url_1 = initial_url + '%s' % (44 * page_n)
            _say('url_1:', url_1)
            page_n += 1
            try:
                get_praised_good(url_1, file_open, keyword, counts)
            except (IOError, OSError) as err:
                # A failed page should not throw away what is collected.
                counter1.fail_time += 1
                _say('failed to fetch page:', err)
            time.sleep(2)
            _say("%s" % page_n, "pages have been searched")
            if page_n >= 11:
                _say("check keyword,maybe too restrict")
                break

        if url_1 is not None:
            _say(url_1)
        product_rank(counter1.results)
        counter1.results.sort(key=lambda row: row[9], reverse=True)
        save_downpic(counter1.results, file_open, reserve_file_path)
    finally:
        file_open.close()
    counter1.print_counter()


def _read_line():
    """Read one line from stdin on both Python 2 and 3."""
    try:
        return raw_input()
    except NameError:
        return input()


def _parse_number(token):
    """Parse *token* as int when possible, otherwise as float."""
    token = token.strip()
    try:
        return int(token)
    except ValueError:
        return float(token)


def _read_numbers(defaults, announce=True):
    """Read ``len(defaults)`` comma separated numbers, or return *defaults*.

    Replaces the Python 2 ``input()`` of the original, which evaluated
    whatever the user typed.
    """
    try:
        values = tuple(_parse_number(t) for t in _read_line().split(','))
        if len(values) != len(defaults):
            raise ValueError('wrong number of values')
    except (ValueError, EOFError):
        if announce:
            _say('not input or wrong number,use default range')
        return defaults
    return values


def interactive_main():
    """Ask for all search settings on the console, then run the search.

    This is the prompt-driven variant that the original kept disabled inside
    a string literal; it is meant for a packaged (pyinstaller) build.
    """
    global keyword, description_higher_require, service_higher_require
    global description_count_require

    _say(u'说明:\n本程序用于在淘宝上搜索商品时主动通过 价格范围、商品描述、服务态度、评论数来筛选商品;\n筛选出来的商品图片下载保存到磁盘(默认桌面新建find_worty_goods文件夹)并建立同序号开头的txt文件,图片显示商品,其旁的txt文件名显示价格等关键信息,txt里保存商品的淘宝链接')
    _say("please input reserch _goods_name")
    _say(u'请输入搜索商品名称;注意不能有空格,下同')
    reserch_goods = _read_line()
    if not reserch_goods:
        _say("no search goods")
        _say(u'没有输入商品名称')
        return

    keyword = '.'  # any title is accepted

    _say("\nplease input _minimal price and _maximal price; \ndefault by 0,10000\nnext by 'enter'key input nothing means by default,the same below ")
    _say(u'请输入价格范围 ;默认0-10000 ;两项用半角逗号","分隔 按回车键确认;什么也不输入代表使用默认值 ')
    price_min, price_max = _read_numbers((0, 10000))

    _say("\nplease input _description_higher_percent_require and _service_higher__percent_require\n range:(-100,100) ; \ndefault by 0,0 I.e better than average")
    _say(u'请输入商品描述、服务高于平均值的百分比-100 ~100')
    description_higher_require, service_higher_require = _read_numbers((0, 0))

    _say("\nplease input description count limit, default more than 1")
    _say(u'输入最低商品评价数,默认大于1')
    (description_count_require,) = _read_numbers((1,))

    _say("\nIF customise file reserve path, Y or N \ndefault/sample as: C:\\Users\\Administrator\\Desktop\\find_worthy_goods\\results ")
    _say(u'是否自定义保存文件目录 Y or N')
    if _read_line() == 'Y':
        _say("please input path that you want to reserve; \n ")
        reserve_file_path = _read_line()
    else:
        reserve_file_path = r"C:\Users\Administrator\Desktop\find_worthy_goods\results"

    # Never write into an existing folder: extend the name until it is new.
    while os.path.exists(reserve_file_path):
        reserve_file_path = reserve_file_path + '%s' % random.randrange(1, 10)
    try:
        os.makedirs(reserve_file_path)
    except OSError:
        _say("failed to make file path\nplease restart program")
        _say(u'创建文件夹失败,请重新启动程序')
        return
    _say('ok,file_path we reserve results: %s' % reserve_file_path)
    _say(u'保存的路径为:')

    _say("\nplease input how many results you want, default by 50")
    _say(u'您要获取的商品数目,默认50')
    (counts,) = _read_numbers((50,), announce=False)

    get_all_praised_goods(reserch_goods, counts, reserve_file_path,
                          price_min, price_max)
    _say('\n')
    counter1.print_counter()
    _say("finished,please look up in %s" % reserve_file_path)
    _say(u'下载完成')
    _say(counter1.results)
    try:
        _read_line()  # keep the console window open until Enter is pressed
    except EOFError:
        pass


if __name__ == "__main__":

    reserch_goods = '英伦男外套'  # Taobao search term
    # Regex the product title must match; guards against Taobao mixing in
    # loosely related products.  '' (or '.') accepts every title.
    keyword = ''
    price_min = 22   # price range
    price_max = 300
    description_higher_require = 0  # % above average required
    service_higher_require = 0      # % above average required
    description_count_require = 6

    counts = 50

    # Folder the results are written to.
    reserve_file_path = r"C:\Users\Administrator\Desktop\Python scrapy\find_worthy_goods\results"
    while os.path.exists(reserve_file_path):
        reserve_file_path = reserve_file_path + '%s' % random.randrange(0, 100)
    if not os.path.exists(reserve_file_path):
        os.makedirs(reserve_file_path)

    get_all_praised_goods(reserch_goods, counts, reserve_file_path,
                          price_min, price_max)

    # To ask for the settings on the console instead, call interactive_main().

# How to use the output:
# - pictures are saved under the product title, prefixed with the rank number
# - price, shop name, description and service scores go to the txt files
# - once a picture looks good, find its details by the same number
# - products scoring above average on description and service should give a
#   decent shopping experience
原文:http://www.cnblogs.com/willowj/p/6238406.html