
Python web crawler


This week I learned how to do web scraping with Python. Since the goal was only to extract data, the code is rough: it crawls the news articles and research reports for each stock from Tonghuashun Finance (同花顺财经), Sina Finance (新浪财经), and Eastmoney (东方财富网).

# -*- coding: utf-8 -*-
import urllib.request,re,time,random,gzip
from bs4 import BeautifulSoup

def get_stock_code():
    # Pull the SSE 50 constituent list via tushare and return the code column.
    import tushare as ts
    df_stock = ts.get_sz50s()
    code_list = df_stock['code']
    return code_list
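
For reference, tushare's get_sz50s() returns a DataFrame of SSE 50 index constituents, so the list used throughout is a pandas Series of six-digit ticker strings; a quick sanity check could look like this:

codes = get_stock_code()
print(len(codes), codes.iloc[0])  # expect 50 entries, e.g. '600000'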


def saveFile(data, i):
    # Dump one page of scraped records to a local text file, GBK-encoded.
    path = "E:\\projects\\paper_" + str(i+1) + ".txt"
    file = open(path, 'wb')
    page = 'page ' + str(i+1) + '\n'   # page header (the original label was lost when the post's quotes were stripped)
    file.write(page.encode('gbk'))
    for d in data:
        d = str(d) + '\n'
        file.write(d.encode('gbk'))
    file.close()
  
def ungzip(data):
    # Some responses come back gzip-compressed; try to decompress, and fall
    # back to the raw bytes if they were not compressed.
    try:
        data = gzip.decompress(data)
    except OSError:
        print("Response was not gzip-compressed; using it as-is...")
    return data
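
# An alternative sketch: gzip streams start with the magic bytes 0x1f 0x8b,
# so the header can be checked up front instead of waiting for an exception
# (ungzip_checked is a hypothetical variant, not used elsewhere in this post):
def ungzip_checked(data):
    if data[:2] == b'\x1f\x8b':
        return gzip.decompress(data)
    return data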
  
# Spider class; the skeleton was adapted from a CSDN blog spider, hence the
# class name and the default URL.
class CSDNSpider:
    def __init__(self, pageIdx=1, url="http://blog.csdn.net/fly_yr/article/list/1"):
        # Default to the first page of the listing.
        self.pageIdx = pageIdx
        self.url = url[0:url.rfind('/') + 1] + str(pageIdx)
        self.headers = {
            #"Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6",
            #"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            #"Accept-Encoding": "gzip, deflate, sdch",
            #"Accept-Language": "zh-CN,zh;q=0.8",
            #"Host": "data.eastmoney.com"
        }
        
    def readData_tonghuashun_report(self, urlweb, code):
        # Fetch one Tonghuashun research-report snapshot page and pull out
        # the date, the issuing institution, and the body text.
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = res.read()
        data = ungzip(data)
        data = data.decode('utf-8')
        soup = BeautifulSoup(data, "html5lib")
        info = soup.find('div', 'kuaizhaolefttitle aDBlue nu')
        info = info.find('p').string.split(' ')
        time = info[0]
        source = info[1].split('机构:')[1]
        source = source.split('\xa0')[0]
        content = str(soup.find('div', 'kuaizhao_contant aDBlue nu'))
        content = content.split('<br/><br/>')
        contenttemp = ''
        for i in content:
            contenttemp = contenttemp + i + '\n'
        contenttemp = contenttemp.split('>')[1]
        contenttemp = contenttemp.split('<')[0]
        print(contenttemp)
        article = time + '\n' + source + '\n' + contenttemp
        return article
    
    def readData_tonghuashun_news(self, urlweb, code):
        # Fetch one Tonghuashun news page and extract date, source and body.
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = res.read()
        data = ungzip(data)
        soup = BeautifulSoup(data, "html5lib")
        time = soup.find('span', 'time').string
        source = soup.find('span', {"id": "source_baidu"})
        source = str(source.find('a')).split('>')
        source = source[1].split(' ')
        # The source string is padded with whitespace; keep the first
        # non-empty token.
        sourcetemp = ''
        for i in source:
            for j in i.split('\n'):
                for k in j.split('\t'):
                    if k != '':
                        sourcetemp = k
                        break
                if sourcetemp != '':
                    break
            if sourcetemp != '':
                break
        source = sourcetemp
        content = soup.find('div', 'atc-content')
        contents = content.find_all('p')
        contenttemp = ''
        content = ''
        for i in contents:
            content = content + str(i) + '\n'
        # Keep CJK characters, digits, newlines and basic Chinese punctuation
        # (the exact punctuation literals were lost from the original post).
        for s in content:
            if s in ('，', '。', '\n') or s.isdigit():
                contenttemp += s
                continue
            if u'\u4e00' <= s <= u'\u9fff':
                contenttemp += s
        article = time + '\n' + source + '\n' + contenttemp
        return article
#         pat_1 = re.compile(r'[\u4e00-\u9fa5]')
#         for item in re.findall(pat_1, content):
#             contenttemp += item
#         print(contenttemp)
        
    
    def readData_sina_report(self, urlweb, code):
        # Fetch one Sina research-report page; pull the source and date out
        # of the 'creab' info block, then dump all <p> text.
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)

        data = res.read()
        data = ungzip(data)
        soup = BeautifulSoup(data, "html5lib")
        infos = soup.find_all('div', 'creab')
        for info in infos:
            spans = info.find_all('span')
            for i in spans:
                for j in i:
                    # Each span either carries the date ("日期：...") or the
                    # source; the literal tested here was lost from the
                    # original post.
                    if "日期" in j:
                        time = j.split()[-1]
                    else:
                        source = j.string

        contents = soup.find_all('p')
        contentAll = ""
        for content in contents:
            contentAll += str(content)
        print(contentAll)
        # Assemble the same code/time/source/body layout the save routine expects.
        article = code + '\n' + time + '\n' + source + '\n' + contentAll
        return article
    def readData_sina_news(self, urlweb, code):
        # Fetch one Sina news page; the 'time-source' span holds both the
        # publish time and the source, separated by whitespace.
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)

        data = res.read()
        data = ungzip(data)
        data = data.decode('utf-8')
        soup = BeautifulSoup(data, "html5lib")
        time = soup.find('span', 'time-source')
        time = time.string.split(' ')
        count = 0
        # The first non-empty token is the time, the second is the source.
        for i in time:
            for j in i.split('\n'):
                for k in j.split('\t'):
                    if k != "" and count == 0:
                        timetemp = k
                        count += 1
                    elif k != "" and count == 1:
                        source = k
                        break

        time = timetemp
        contents = soup.find('div', 'article article_16')
        contents = contents.find_all('p')
        content = ''
        for i in range(len(contents) - 1):
            try:
                content += contents[i+1].string + '\n'
            except TypeError:  # .string is None for tags with nested markup
                pass
        article = code + '\n' + time + '\n' + source + '\n' + content
        print(article)
        return article
    
    
    
    
    def readData_eastmoney_news(self, urlweb):
        # Fetch one Eastmoney news page: publish time from the 'time' div,
        # source from the alt text of the source logo, body from all <p> tags.
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = res.read()
        data = ungzip(data)
        data = data.decode('utf-8')
        soup = BeautifulSoup(data, "html5lib")
        time = soup.find_all('div', 'time')
        time = time[0].string
        sources = soup.find('div', 'source')
        trs = sources.find("img")
        source = str(trs.attrs["alt"])
        contents = soup.find_all('p')
        contentStr = ''
        for i in range(len(contents)):
            try:
                contentStr += contents[i].string
                contentStr += '\n'
            except TypeError:  # .string is None for tags with nested markup
                pass
        article = time + '\n' + source + '\n' + contentStr
        return article
    
    def readData_eastmoney_report(self, urlweb):
        # Fetch one Eastmoney research-report page; the 'report-infos' div
        # holds the date and the institution in its second and third <span>.
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = res.read()
        data = ungzip(data)
        soup = BeautifulSoup(data, "html5lib")
        infos = soup.find('div', 'report-infos')
        infos = infos.find_all('span')
        # Both fields are padded with whitespace; keep the first non-empty token.
        time = infos[1].string.split(' ')
        timetemp = ""
        for i in time:
            for j in i.split('\n'):
                for k in j.split('\t'):
                    if k != "":
                        timetemp = k
                        break
                if timetemp != "":
                    break
            if timetemp != "":
                break
        time = timetemp
        source = infos[2].string.split(' ')
        for i in source:
            for j in i.split('\n'):
                for k in j.split('\t'):
                    if k != "":
                        sourcetemp = k
                        break
        source = sourcetemp

        contents = soup.find_all('p')
        contentStr = ''
        for i in range(len(contents)):
            try:
                contentStr += contents[i].string
                contentStr += '\n'
            except TypeError:  # .string is None for tags with nested markup
                pass
        article = time + '\n' + source + '\n' + contentStr
        return article
    
    
                 
                 
    def getAllUrl(self, url):
        # Grab every href target on the page with a regex over the raw HTML.
        import re
        import requests
        r = requests.get(url)
        data = r.text
        link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)
        return link_list
    
    def getAllUrl2(self, url):
        # Collect the links from three Eastmoney landing pages.
        cs = CSDNSpider()
        url_list = []
        for i in cs.getAllUrl('http://finance.eastmoney.com/yaowen.html'):
            url_list.append(i)
        for i in cs.getAllUrl('http://finance.eastmoney.com/pinglun.html'):
            url_list.append(i)
        for i in cs.getAllUrl('http://stock.eastmoney.com/bidu.html'):
            url_list.append(i)
        return url_list
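
The link scraper can be exercised on its own before wiring it into the save routines; a minimal sketch (the page URL is just one of the Eastmoney lists used above):

spider = CSDNSpider()
links = spider.getAllUrl("http://finance.eastmoney.com/yaowen.html")
print(len(links))  # total hrefs found on the page
print(links[:5])   # first few, to eyeball what needs filtering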
            


def save_file_sina():
    import os
    import csv
    cs = CSDNSpider()
    code_list = get_stock_code()
    for code in code_list:
        code = str(code)
        URL = "http://money.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol=sh" + code + "&Page=1"
        print(URL)
        url_list = cs.getAllUrl(URL)
        time = []
        source = []
        count = 0
        for i in url_list:
            try:
                # Report links go through the Sina search pages.
                if 'search' in i:
                    artical = cs.readData_sina_report(i, code)
                else:
                    artical = cs.readData_sina_news(i, code)
                artical = artical.split('\n')
                code = artical[0]
                timetemp = artical[1].split(' ')[0]
                # Normalise dates like 2017年03月04日 to 2017-03-04 (these
                # literals were stripped from the original post).
                timetemp = timetemp.split('日')[0]
                if "年" in timetemp:
                    timetemp = timetemp.replace('年', '-')
                    timetemp = timetemp.replace('月', '-')
                sourcetemp = artical[2]

                path = 'abc/sina/' + code + '/' + timetemp + '/'
                print(path)
                if timetemp in time:
                    if sourcetemp in source:
                        # Same day, same source: append to the existing file.
                        csvFile = open(path + '/' + sourcetemp + '.txt', "a")
                        writer = csv.writer(csvFile)
                        print(artical)
                        writer.writerow(artical)
                        csvFile.close()
                    else:
                        # Same day, new source: start a new file.
                        source.append(sourcetemp)
                        csvfile = open(path + '/' + sourcetemp + '.txt', 'w')
                        writer = csv.writer(csvfile)
                        print(artical)
                        writer.writerow(artical)
                        csvfile.close()
                else:
                    # New day: create the directory first.
                    os.makedirs(path)
                    csvfile = open(path + '/' + sourcetemp + '.txt', 'w')
                    time.append(timetemp)
                    writer = csv.writer(csvfile)
                    print(artical)
                    writer.writerow(artical)
                    csvfile.close()
                print(count)
                count = count + 1
            except Exception:
                # Skip pages that fail to download or parse.
                pass




def save_file_eastmoney():
    import os
    import csv
    cs = CSDNSpider()
    code_list = get_stock_code()
    for code in code_list:
        code = str(code)
        url = "http://quote.eastmoney.com/sh" + code + ".html"
        url_list = cs.getAllUrl(url)
        time = []
        mechanism = []
        count = 0
        for url in url_list:
            try:
                if "news" in url:
                    artical = cs.readData_eastmoney_news(url)
                elif "report" in url:
                    artical = cs.readData_eastmoney_report(url)
                else:
                    continue
                artical = artical.split('\n')
                # Normalise dates like 2017年03月04日 to 2017-03-04 (these
                # literals were stripped from the original post).
                timetemp = artical[0].split('日')[0]
                if "年" in timetemp:
                    timetemp = timetemp.replace('年', '-')
                    timetemp = timetemp.replace('月', '-')
                mechanismtemp = artical[1]
                path = 'abc/eastmoney/' + code + '/' + timetemp

                print(path)
                if timetemp in time:
                    if mechanismtemp in mechanism:
                        # Same day, same institution: append to the existing file.
                        csvFile = open(path + '/' + mechanismtemp + '.txt', "a")
                        writer = csv.writer(csvFile)
                        print(artical)
                        writer.writerow(artical)
                        csvFile.close()
                    else:
                        # Same day, new institution: start a new file.
                        mechanism.append(mechanismtemp)
                        csvfile = open(path + '/' + mechanismtemp + '.txt', 'w')
                        writer = csv.writer(csvfile)
                        print(artical)
                        writer.writerow(artical)
                        csvfile.close()
                else:
                    # New day: create the directory first.
                    os.makedirs(path)
                    csvfile = open(path + '/' + mechanismtemp + '.txt', 'w')
                    time.append(timetemp)
                    writer = csv.writer(csvfile)
                    print(artical)
                    writer.writerow(artical)
                    csvfile.close()
                print(count)
                count = count + 1
            except Exception:
                # Skip pages that fail to download or parse.
                pass
# Smoke test: fetch a single Tonghuashun report page.
cs = CSDNSpider()
cs.readData_tonghuashun_report("http://search.10jqka.com.cn/snapshot/report_pdf/ea02eda3880f930e.html", 100000)
#cs.readData_sina_news('http://cj.sina.com.cn/article/detail/5966752440/177997', '60000')
#cs.readData_sina2('http://finance.sina.com.cn/stock/hyyj/2017-02-28/doc-ifyavvsk3874186.shtml', '10000')
#cs.readData_sina('http://finance.sina.com.cn/stock/hyyj/2016-11-23/doc-ifxxwrwk1751619.shtml')
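
To build the full corpus rather than fetch a single page, call the two save routines defined above; each one walks the SSE 50 code list, gathers the article links for every stock, and writes one text file per date and source:

save_file_sina()        # writes under abc/sina/<code>/<date>/<source>.txt
save_file_eastmoney()   # writes under abc/eastmoney/<code>/<date>/<institution>.txt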

 

Original post: http://www.cnblogs.com/yunerlalala/p/6501397.html