# -*- coding: utf-8 -*-
#################################
# date: 2017-10-1
# version: 1.0
#################################
import threading
import re
import urllib2
import chardet
from BeautifulSoup import BeautifulSoup
import time
import os
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
class myThreads(threading.Thread):
    def __init__(self, threadname, filename):
        threading.Thread.__init__(self)
        self.threadname = threadname
        self.filename = filename

    def run(self):
        print "Starting download url:", self.threadname
        extract_news_content(self.threadname, self.filename)
        time.sleep(2)
        print "Exiting " + self.threadname
def remove_js_css(content):
    """Remove javascript, stylesheets, comments, meta and ins content
    (<script>...</script>, <style>...</style>, <!-- ... -->, etc.)."""
    r = re.compile(r'''<script.*?</script>''', re.I | re.M | re.S)
    s = r.sub('', content)
    r = re.compile(r'''<style.*?</style>''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<!--.*?-->''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<meta.*?>''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<ins.*?</ins>''', re.I | re.M | re.S)
    s = r.sub('', s)
    return s
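# A quick sketch of what the cleanup does (hypothetical input, not from the post):
#   remove_js_css('<style>p{}</style><p>hi</p>')  ->  '<p>hi</p>'
# i.e. script/style/comment/meta/ins blocks vanish, everything else is untouched.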
def remove_empty_line(content):
    """Remove whitespace-only lines and collapse runs of newlines."""
    r = re.compile(r'''^\s+$''', re.M | re.S)
    s = r.sub('', content)
    r = re.compile(r'''\n+''', re.M | re.S)
    s = r.sub('\n', s)
    return s
def remove_any_tag(s):
    s = re.sub(r'''<[^>]+>''', '', s)
    return s.strip()
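# For illustration (made-up input): every tag is dropped and the result stripped,
#   remove_any_tag('<p> hello <b>world</b> </p>')  ->  'hello world'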
def remove_any_tag_but_a(s):
    """Return (length of anchor text, length of all plain text)."""
    text = re.findall(r'''<a[^r][^>]*>(.*?)</a>''', s, re.I | re.S)
    text_b = remove_any_tag(s)
    return len(''.join(text)), len(text_b)
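# This returns a pair of lengths, not cleaned text. For a made-up input
# '<p>news <a href="#">more</a></p>' it yields (4, 9): 4 characters of
# anchor text ('more') versus 9 characters of total plain text ('news more').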
def remove_image(s, n=50):
    image = 'a' * n
    r = re.compile(r'''<img.*?>''', re.I | re.M | re.S)
    s = r.sub(image, s)
    return s
def remove_video(s, n=1000):
    video = 'a' * n
    r = re.compile(r'''<embed.*?>''', re.I | re.M | re.S)
    s = r.sub(video, s)
    return s
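# Note (a reading of the code, not stated in the original post): the dummy
# 'a' runs make an <img> count as ~50 chars and an <embed> as ~1000 chars of
# body text, so media-heavy article blocks are not discarded by the scoring.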
def sum_max(values):
    """Kadane's algorithm: locate the contiguous run with the maximum sum."""
    cur_max = values[0]
    glo_max = -999999
    left, right = 0, 0
    for index, value in enumerate(values):
        cur_max += value
        if cur_max > glo_max:
            glo_max = cur_max
            right = index
        elif cur_max < 0:
            cur_max = 0
    # walk back from the right edge to recover the left edge
    for i in range(right, -1, -1):
        glo_max -= values[i]
        if abs(glo_max) < 0.00001:  # was abs(glo_max < 0.00001): misplaced parenthesis
            left = i
            break
    return left, right + 1
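# Worked example with hypothetical scores: sum_max([-5, 3, 4, -2, 6, -9])
# tracks the best running sum, ending at index 4 with glo_max = 11, then
# walks back until the residue hits 0 at index 1, so it returns (1, 5):
# the slice values[1:5] = [3, 4, -2, 6] sums to the maximum, 11.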
def method_1(content, k=1):
    if not content:
        return None, None, None, None
    tmp = content.split('\n')
    group_value = []
    for i in range(0, len(tmp), k):
        group = '\n'.join(tmp[i:i + k])
        group = remove_image(group)
        group = remove_video(group)
        text_a, text_b = remove_any_tag_but_a(group)
        # score = plain text minus anchor text, minus a small penalty,
        # so link-heavy navigation lines come out negative
        temp = (text_b - text_a) - 8
        group_value.append(temp)
    left, right = sum_max(group_value)
    return left, right, len('\n'.join(tmp[:left])), len('\n'.join(tmp[:right]))
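# In short: each line scores (plain text - anchor text - 8), so navigation
# and link lists go negative while article paragraphs go positive, and
# sum_max picks the densest contiguous run of lines as the article body.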
def extract(content):
    content = remove_empty_line(remove_js_css(content))
    left, right, x, y = method_1(content)
    return '\n'.join(content.split('\n')[left:right])
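# Usage sketch (raw_html is a placeholder for HTML fetched elsewhere):
#   body_text = extract(raw_html)
#   # body_text now holds only the line range that scored highest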
# Given a url, extract the body of the news page and write it to a txt file
def extract_news_content(web_url, file_name):
    html = ""
    request = urllib2.Request(web_url)
    # add a User-Agent header so the request looks like a browser
    request.add_header('User-Agent',
                       'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    opener = urllib2.build_opener()
    try:
        html = opener.open(request).read()
    except urllib2.HTTPError, e:
        print e.code
    except urllib2.URLError, e:
        print e.reason
    infoencode = chardet.detect(html)['encoding']  # detect the page encoding via a third-party module
    if html and infoencode:  # proceed only if the fetch and the detection both succeeded
        html = html.decode(infoencode, 'ignore')
        soup = BeautifulSoup(html)
        content = soup.renderContents()
        content_text = extract(content)  # extract the article body as one unwrapped block of text
        content_text = re.sub("&nbsp;", " ", content_text)
        content_text = re.sub("&gt;", "", content_text)
        content_text = re.sub("&quot;", '""', content_text)
content_text = re.sub("<[^>]+>", "", content_text)
content_text = re.sub("\n", "", content_text)
file = open(file_name, ‘a‘) # append
file.write(content_text)
file.close()
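# Hypothetical call (url and path are placeholders, not from the post):
#   extract_news_content('http://news.example.com/a.html', 'E:\\Python27\\1.txt')
# appends the extracted body of that page to the given txt file.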
# Crawl Baidu News search results: Chinese search, url param key=keyword
def search(key):
    search_url = 'http://news.baidu.com/ns?word=key_word&tn=news&from=news&cl=2&rn=20&ct=1'
    req2 = urllib2.Request(search_url.replace('key_word', key))
    req2.add_header('User-Agent',
                    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    req = urllib2.urlopen(req2, timeout=10)
    real_visited = 0
    threads = []
    for count in range(50):  # iterate over result pages
        html = req.read()
        soup = BeautifulSoup(html)
        content = soup.findAll("div", {"class": "result"})  # ResultSet object
        num = len(content)  # 20 results per page
        print "data:", num, count
        file_dir = "E:\\Python27\\newscn\\%s" % (key.encode('gb2312'))
        if not os.path.exists(file_dir):
            os.mkdir(file_dir)
        for i in range(num):
            # parse out each result's title, source, time and url
            p_str = content[i].find('a')  # NoneType if there is no result
            contenttitle = p_str.renderContents()
            contenttitle = contenttitle.decode('utf-8', 'ignore')  # needed
            contenttitle = re.sub("<[^>]+>", "", contenttitle)
            contentlink = str(p_str.get("href"))
            # compare against the urls already crawled successfully
            visited_url = open("E:\\Python27\\visited-cn.txt", 'r')  # has this url been crawled before?
            visited_url_list = visited_url.readlines()
            visited_url.close()  # close promptly
            exist = 0
            for item in visited_url_list:
                if item.strip('\n') == contentlink:
                    exist = 1
                    break
            print "url:", contentlink, "status", exist
            if exist != 1:  # url not visited yet
                p_str2 = content[i].find('p').renderContents()
                contentauthor = p_str2[:p_str2.find("&nbsp;&nbsp;")]  # source
                contentauthor = contentauthor.decode('utf-8', 'ignore')
                contenttime = p_str2[p_str2.find("&nbsp;&nbsp;") + len("&nbsp;&nbsp;") + 1:]  # publication time
                contenttime = contenttime.decode('utf-8', 'ignore')
                real_visited += 1
                file_name = "E:\\Python27\\newscn\\%s\\%d.txt" % (key.encode('gb2312'), real_visited)
                file = open(file_name, 'w')
                file.write(contenttitle.encode('utf-8'))
                file.write(u'\n')
                file.write(contentauthor.encode('utf-8'))
                file.write(u'\n')
                file.write(contenttime.encode('utf-8'))
                file.write(u'\n' + contentlink + u'\n')
                file.close()
                threadnew = myThreads(contentlink, file_name)
                threads.append(threadnew)
                # extract_news_content(contentlink, file_name)  # would also write into the file
                visited_url_list.append(contentlink)  # mark as visited in memory
                visited_url = open("E:\\Python27\\visited-cn.txt", 'a')  # persist the mark so it survives restarts
                visited_url.write(contentlink + u'\n')
                visited_url.close()
            if len(visited_url_list) >= 120:
                break
        # move on to the next results page
        print "page:", count, "url address:", visited_url_list
        if len(visited_url_list) >= 240:  # at most 12 pages
            break
        if count == 0:
            next_num = 0
        else:
            next_num = 1
        try:
            # search for the "next page" link and follow it
            next_page = 'http://news.baidu.com' + soup('a', {'href': True, 'class': 'n'})[next_num]['href']
        except:
            break
        req2 = urllib2.Request(next_page)
        req2.add_header('User-Agent',
                        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
        req = urllib2.urlopen(req2, timeout=10)
    for th in threads:
        th.setDaemon(True)
        th.start()
    for th in threads:
        th.join()
if __name__ == '__main__':
    key_word = raw_input('input key word:')
    search(key_word)