首页 > 编程语言 > 详细

python 爬虫 requests+BeautifulSoup 爬取巨潮资讯公司概况代码实例

时间:2018-07-18 23:59:24      阅读:396      评论:0      收藏:0      [点我收藏+]

第一次写一个算是比较完整的爬虫,自我感觉极差:代码 low,效率差,也没有保存到本地文件或者数据库,还强行使用了一波多线程,导致数据顺序发生了变化……

贴在这里,引以为戒吧。

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 18 21:41:34 2018
@author: brave-man
blog: http://www.cnblogs.com/zrmw/
"""

import requests
from bs4 import BeautifulSoup
import json
from threading import Thread

# Fetch a listed company's full name, English name, address and legal
# representative (any other field on the brief page can be read the same way).
def getDetails(url, timeout=10):
    """Download one company brief page, print its key fields and return them.

    Args:
        url: Address of the company brief page to fetch.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the whole crawl indefinitely.

    Returns:
        A dict with the string-valued keys "code", "Entire_Name",
        "English_Name", "Address" and "Legal_Representative". The dict is
        plain data, so it can be passed to ``json.dumps`` for storage.

    Raises:
        requests.exceptions.RequestException: the request failed or timed out.
        IndexError: the page has no ".table" node or fewer than five
            ".zx_data2" nodes.
        AttributeError: the first ".table" node contains no <td>.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0"}
    res = requests.get(url, headers=headers, timeout=timeout)
    res.encoding = "GBK"
    soup = BeautifulSoup(res.text, "html.parser")

    # Select the value cells once instead of re-querying the tree per field.
    cells = soup.select(".zx_data2")

    def cell_text(index):
        # Values are padded with line breaks and spaces in the markup.
        return cells[index].text.strip("\r\n ")

    # NOTE: str.lstrip treats its argument as a set of characters, not as a
    # prefix; that is harmless here only because the code itself is expected
    # to start with a character outside that set.
    code_cell = soup.select(".table")[0].td.text

    details = {
        "code": code_cell.lstrip("股票代码:")[:6],
        "Entire_Name": cell_text(0),
        "English_Name": cell_text(1),
        "Address": cell_text(2),
        # Cell 3 is skipped, exactly as in the original field mapping.
        "Legal_Representative": cell_text(4),
    }
    print(details)
    return details
# Collect the stock codes of the listed companies from the company-list page.
def getCode(timeout=10):
    """Scrape the company list page and crawl every company found on it.

    The first four ".company-list" containers on the page are read; the first
    six characters of each link text are taken as the stock code. The first
    group is printed, then all four groups are handed to ``getAll``.

    Args:
        timeout: Seconds to wait for the list page before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        Whatever ``getAll`` returns (``None``).

    Raises:
        requests.exceptions.RequestException: the request failed or timed out.
        IndexError: the page has fewer than four ".company-list" containers.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0"}
    res = requests.get(
        "http://www.cninfo.com.cn/cninfo-new/information/companylist",
        headers=headers,
        timeout=timeout,
    )
    # The original assigned "gb1232", which is not a codec name (presumably a
    # typo for gb2312). GBK is a superset of GB2312 and is what getDetails
    # uses. NOTE(review): confirm the page's real charset; only the leading
    # six characters of each link are used below.
    res.encoding = "GBK"
    soup = BeautifulSoup(res.text, "html.parser")

    containers = soup.select(".company-list")
    # One list of six-character codes per container, in page order.
    L = [
        [link.text[:6] for link in containers[index].find_all("a")]
        for index in range(4)
    ]
    print(L[0])
    return getAll(L)


def getAll(L):
    """Fetch the brief page of every stock code, one group after another.

    Groups are processed sequentially so the printed records keep the order
    of the input lists.

    Args:
        L: Sequence of four lists of stock codes. ``L[i]`` is crawled with the
            i-th URL template below.

    Raises:
        IndexError: ``L`` has fewer than four groups (raised when the missing
            group is reached, after earlier groups have been processed).
    """
    # URL template per group, in the same order as the groups in L.
    # Presumably: Shenzhen main board, SME board, ChiNext, Shanghai main
    # board (inferred from the URL slugs) - TODO confirm.
    url_templates = (
        "http://www.cninfo.com.cn/information/brief/szmb{}.html",
        "http://www.cninfo.com.cn/information/brief/szsme{}.html",
        "http://www.cninfo.com.cn/information/brief/szcn{}.html",
        "http://www.cninfo.com.cn/information/brief/shmb{}.html",
    )
    for index, template in enumerate(url_templates):
        for code in L[index]:
            getDetails(template.format(code))


if __name__ == "__main__":
    getCode()

没有考虑实际生产中突发的状况,比如网速延迟卡顿等问题。

速度是真慢,有时间会分享给大家 selenium + 浏览器 的爬取巨潮资讯的方法代码。晚安~

python 爬虫 requests+BeautifulSoup 爬取巨潮资讯公司概况代码实例

原文:https://www.cnblogs.com/zrmw/p/9333385.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!