
食品伙伴网 (foodmate.net) crawler

Posted: 2019-03-19 20:32:50

A routine crawler that simply downloads PDF files.

Gitee repository: https://gitee.com/MarkPolaris/food_partnership_network/tree/master

Overview page

import requests
import re
import pymysql
import hashlib
import datetime


class GLY(object):
    def __init__(self):
        # overview page listing the food additive standards
        self.url = 'http://down.foodmate.net/special/standard/8.html'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
        }
        # MySQL connection settings
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'

    def get_url(self):
        # fetch the overview page and extract the detail-page links
        response = requests.get(self.url, headers=self.headers)
        response.encoding = response.apparent_encoding
        html = response.text
        urls = re.findall(r'<A title=.*?href="(.*?)"', html)
        # deduplicate
        urls = set(urls)
        for url in urls:
            hkey = hashlib.md5(url.encode(encoding='utf-8')).hexdigest()
            tag = 0  # 0 = not crawled yet
            channel = '食品添加剂标准'
            sitename = '食品伙伴网'
            lasttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            list_data = [url, hkey, tag, channel, sitename, lasttime]
            self.save_url(list_data)
        print(len(urls))

    def save_url(self, list_data):
        # insert one link record into the gly table
        con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'insert into gly(link, hkey, tag, channel, sitename, lasttime) values (%s, %s, %s, %s, %s, %s)'
        try:
            cur.execute(sql, list_data)
            print('insert success')
        except Exception as e:
            con.rollback()
            print('error~', e)
        else:
            con.commit()
        cur.close()
        con.close()


if __name__ == '__main__':
    gly = GLY()
    gly.get_url()
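The insert statement above assumes that a table named gly already exists in the app_mark database; the original post does not include its schema. A minimal setup sketch is shown below, where the column types, lengths, and the unique key are assumptions rather than part of the original project.

import pymysql

# Hypothetical schema for the gly table; types and lengths are assumptions.
con = pymysql.connect(host='127.0.0.1', db='app_mark', user='root', passwd='123456', charset='utf8mb4')
cur = con.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS gly (
        id       INT AUTO_INCREMENT PRIMARY KEY,
        link     VARCHAR(512) NOT NULL,    -- detail-page URL found on the overview page
        hkey     CHAR(32)     NOT NULL,    -- md5 of the URL, used for deduplication
        tag      CHAR(1)      DEFAULT '0', -- '0' = not crawled yet, '1' = crawled
        channel  VARCHAR(64),
        sitename VARCHAR(64),
        lasttime DATETIME,
        UNIQUE KEY uq_hkey (hkey)
    ) DEFAULT CHARSET=utf8mb4
""")
con.commit()
cur.close()
con.close()

With a unique key on hkey, re-running the overview crawler makes duplicate inserts fail and fall into the except branch of save_url, so the same link is never stored twice.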

Detail page

import pymysql
import re
import datetime
import requests
from multiprocessing.dummy import Pool as ThreadPool


class XLY(object):
    def __init__(self):
        # MySQL connection settings
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
        }
        self.start = datetime.datetime.now()

    def get_urls(self):
        # read the detail-page links that have not been crawled yet (tag = "0")
        con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'select link from gly where tag = "0" and sitename = "食品伙伴网"'
        after_sql = 'update gly set tag = "1"'  # note: defined but never executed
        try:
            cur.execute(sql)
            results = cur.fetchall()
        except Exception as e:
            con.rollback()
            print('error~', e)
            results = None
        else:
            con.commit()
        cur.close()
        con.close()
        return results

    def download(self, url):
        # url is a one-element tuple returned by fetchall()
        url = url[0]
        response = requests.get(url, headers=self.headers)
        response.encoding = response.apparent_encoding
        html = response.text
        down_url = re.findall(r'<a class="telecom" href="(.*?)">', html, re.S)
        try:
            down_url = down_url[0]
            r = requests.get(down_url, headers=self.headers)
            file_name = 'D:/1_work/python采集/PDF/' + down_url.split('auth=')[-1] + '.pdf'
            # print(file_name)
            with open(file_name, 'wb') as pdf:
                for content in r.iter_content():
                    pdf.write(content)
        except Exception as e:
            print('error_url: {}; exception: {}'.format(url, e))
        print(down_url)


if __name__ == '__main__':
    xly = XLY()
    urls = xly.get_urls()
    if urls:
        # download with a pool of 20 worker threads
        pool = ThreadPool(20)
        pool.map(xly.download, urls)
        pool.close()
        pool.join()
    end = datetime.datetime.now()
    print('耗时: {}'.format(end - xly.start))
    # single-threaded alternative for debugging:
    # for url in urls:
    #     xly.download(url)
    #     break
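The after_sql statement in get_urls is defined but never executed, so records are never flipped to tag = "1" and every run re-downloads the same links. A small helper along these lines could be called at the end of download() once the PDF has been written; the function name and the per-link UPDATE are a sketch, not part of the original code.

import pymysql

def mark_crawled(link, host='127.0.0.1', db='app_mark', user='root', passwd='123456', charset='utf8mb4'):
    # Hypothetical helper: mark a single link as crawled after its PDF has been saved.
    con = pymysql.connect(host=host, db=db, user=user, passwd=passwd, charset=charset)
    cur = con.cursor()
    try:
        cur.execute('update gly set tag = "1" where link = %s', (link,))
        con.commit()
    except Exception as e:
        con.rollback()
        print('mark error:', e)
    finally:
        cur.close()
        con.close()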

 


Original post: https://www.cnblogs.com/MC-Curry/p/10561068.html
