# -*- coding: utf-8 -*-
#################################
# date: 2017-10-1
# version: 1.0
#################################
import threading
import re
import urllib2
import chardet
from BeautifulSoup import BeautifulSoup
import time
import os
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
class myThreads(threading.Thread):
    def __init__(self, threadname, filename):
        threading.Thread.__init__(self)
        self.threadname = threadname
        self.filename = filename

    def run(self):
        print "Starting download url:", self.threadname
        extract_news_content(self.threadname, self.filename)
        time.sleep(2)
        print "Exiting " + self.threadname
def remove_js_css(content):
    """Remove javascript, stylesheets, comments, meta and ins content
    (<script>...</script>, <style>...</style>, <!-- ... -->, etc.)."""
    r = re.compile(r'''<script.*?</script>''', re.I | re.M | re.S)
    s = r.sub('', content)
    r = re.compile(r'''<style.*?</style>''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<!--.*?-->''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<meta.*?>''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<ins.*?</ins>''', re.I | re.M | re.S)
    s = r.sub('', s)
    return s
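# A quick sketch of what the cleanup does (hypothetical input, not from the post):
#   remove_js_css('<style>p{}</style><p>hi</p>')  ->  '<p>hi</p>'
# i.e. script/style/comment/meta/ins blocks vanish, everything else is untouched.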
def remove_empty_line(content):
    """Remove whitespace-only lines and collapse runs of newlines."""
    r = re.compile(r'''^\s+$''', re.M | re.S)
    s = r.sub('', content)
    r = re.compile(r'''\n+''', re.M | re.S)
    s = r.sub('\n', s)
    return s
def remove_any_tag(s):
    s = re.sub(r'''<[^>]+>''', '', s)
    return s.strip()
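# For illustration (made-up input): every tag is dropped and the result stripped,
#   remove_any_tag('<p> hello <b>world</b> </p>')  ->  'hello world'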
def remove_any_tag_but_a(s):
    """Return (length of anchor text, length of all plain text)."""
    text = re.findall(r'''<a[^r][^>]*>(.*?)</a>''', s, re.I | re.S)
    text_b = remove_any_tag(s)
    return len(''.join(text)), len(text_b)
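# This returns a pair of lengths, not cleaned text. For a made-up input
# '<p>news <a href="#">more</a></p>' it yields (4, 9): 4 characters of
# anchor text ('more') versus 9 characters of total plain text ('news more').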
def remove_image(s, n=50):
    image = 'a' * n
    r = re.compile(r'''<img.*?>''', re.I | re.M | re.S)
    s = r.sub(image, s)
    return s
def remove_video(s, n=1000):
    video = 'a' * n
    r = re.compile(r'''<embed.*?>''', re.I | re.M | re.S)
    s = r.sub(video, s)
    return s
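# Note (a reading of the code, not stated in the original post): the dummy
# 'a' runs make an <img> count as ~50 chars and an <embed> as ~1000 chars of
# body text, so media-heavy article blocks are not discarded by the scoring.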
def sum_max(values):
    """Kadane's algorithm: locate the contiguous run with the maximum sum."""
    cur_max = values[0]
    glo_max = -999999
    left, right = 0, 0
    for index, value in enumerate(values):
        cur_max += value
        if cur_max > glo_max:
            glo_max = cur_max
            right = index
        elif cur_max < 0:
            cur_max = 0
    # walk back from the right edge to recover the left edge
    for i in range(right, -1, -1):
        glo_max -= values[i]
        if abs(glo_max) < 0.00001:  # was abs(glo_max < 0.00001): misplaced parenthesis
            left = i
            break
    return left, right + 1
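# Worked example with hypothetical scores: sum_max([-5, 3, 4, -2, 6, -9])
# tracks the best running sum, ending at index 4 with glo_max = 11, then
# walks back until the residue hits 0 at index 1, so it returns (1, 5):
# the slice values[1:5] = [3, 4, -2, 6] sums to the maximum, 11.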
def method_1(content, k=1):
    if not content:
        return None, None, None, None
    tmp = content.split('\n')
    group_value = []
    for i in range(0, len(tmp), k):
        group = '\n'.join(tmp[i:i + k])
        group = remove_image(group)
        group = remove_video(group)
        text_a, text_b = remove_any_tag_but_a(group)
        # score = plain text minus anchor text, minus a small penalty,
        # so link-heavy navigation lines come out negative
        temp = (text_b - text_a) - 8
        group_value.append(temp)
    left, right = sum_max(group_value)
    return left, right, len('\n'.join(tmp[:left])), len('\n'.join(tmp[:right]))
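# In short: each line scores (plain text - anchor text - 8), so navigation
# and link lists go negative while article paragraphs go positive, and
# sum_max picks the densest contiguous run of lines as the article body.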
def extract(content):
    content = remove_empty_line(remove_js_css(content))
    left, right, x, y = method_1(content)
    return '\n'.join(content.split('\n')[left:right])
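# Usage sketch (raw_html is a placeholder for HTML fetched elsewhere):
#   body_text = extract(raw_html)
#   # body_text now holds only the line range that scored highest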
# Given a url, extract the body of the news page and write it to a txt file
def extract_news_content(web_url, file_name):
    html = ""
    request = urllib2.Request(web_url)
    # add a User-Agent header so the request looks like a browser
    request.add_header('User-Agent',
                       'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    opener = urllib2.build_opener()
    try:
        html = opener.open(request).read()
    except urllib2.HTTPError, e:
        print e.code
    except urllib2.URLError, e:
        print e.reason
    infoencode = chardet.detect(html)['encoding']  # detect the page encoding via a third-party module
    if html and infoencode:  # proceed only if the fetch and the detection both succeeded
        html = html.decode(infoencode, 'ignore')
        soup = BeautifulSoup(html)
        content = soup.renderContents()
        content_text = extract(content)  # extract the article body as one unwrapped block of text
        content_text = re.sub("&nbsp;", " ", content_text)
        content_text = re.sub("&gt;", "", content_text)
        content_text = re.sub("&quot;", '""', content_text)
content_text = re.sub("<[^>]+>", "", content_text)
content_text = re.sub("\n", "", content_text)
file = open(file_name, ‘a‘) # append
file.write(content_text)
file.close()
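# Hypothetical call (url and path are placeholders, not from the post):
#   extract_news_content('http://news.example.com/a.html', 'E:\\Python27\\1.txt')
# appends the extracted body of that page to the given txt file.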
# Crawl Baidu News search results: Chinese search, url param key=keyword
def search(key):
    search_url = 'http://news.baidu.com/ns?word=key_word&tn=news&from=news&cl=2&rn=20&ct=1'
    req2 = urllib2.Request(search_url.replace('key_word', key))
    req2.add_header('User-Agent',
                    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    req = urllib2.urlopen(req2, timeout=10)
    real_visited = 0
    threads = []
    for count in range(50):  # iterate over result pages
        html = req.read()
        soup = BeautifulSoup(html)
        content = soup.findAll("div", {"class": "result"})  # ResultSet object
        num = len(content)  # 20 results per page
        print "data:", num, count
        file_dir = "E:\\Python27\\newscn\\%s" % (key.encode('gb2312'))
        if not os.path.exists(file_dir):
            os.mkdir(file_dir)
        for i in range(num):
            # parse out each result's title, source, time and url
            p_str = content[i].find('a')  # NoneType if there is no result
            contenttitle = p_str.renderContents()
            contenttitle = contenttitle.decode('utf-8', 'ignore')  # needed
            contenttitle = re.sub("<[^>]+>", "", contenttitle)
            contentlink = str(p_str.get("href"))
            # compare against the urls already crawled successfully
            visited_url = open("E:\\Python27\\visited-cn.txt", 'r')  # has this url been crawled before?
            visited_url_list = visited_url.readlines()
            visited_url.close()  # close promptly
            exist = 0
            for item in visited_url_list:
                if item.strip('\n') == contentlink:
                    exist = 1
                    break
            print "url:", contentlink, "status", exist
            if exist != 1:  # url not visited yet
                p_str2 = content[i].find('p').renderContents()
                contentauthor = p_str2[:p_str2.find("&nbsp;&nbsp;")]  # source
                contentauthor = contentauthor.decode('utf-8', 'ignore')
                contenttime = p_str2[p_str2.find("&nbsp;&nbsp;") + len("&nbsp;&nbsp;") + 1:]  # publication time
                contenttime = contenttime.decode('utf-8', 'ignore')
                real_visited += 1
                file_name = "E:\\Python27\\newscn\\%s\\%d.txt" % (key.encode('gb2312'), real_visited)
                file = open(file_name, 'w')
                file.write(contenttitle.encode('utf-8'))
                file.write(u'\n')
                file.write(contentauthor.encode('utf-8'))
                file.write(u'\n')
                file.write(contenttime.encode('utf-8'))
                file.write(u'\n' + contentlink + u'\n')
                file.close()
                threadnew = myThreads(contentlink, file_name)
                threads.append(threadnew)
                # extract_news_content(contentlink, file_name)  # would also write into the file
                visited_url_list.append(contentlink)  # mark as visited in memory
                visited_url = open("E:\\Python27\\visited-cn.txt", 'a')  # persist the mark so it survives restarts
                visited_url.write(contentlink + u'\n')
                visited_url.close()
            if len(visited_url_list) >= 120:
                break
        # move on to the next results page
        print "page:", count, "url address:", visited_url_list
        if len(visited_url_list) >= 240:  # at most 12 pages
            break
        if count == 0:
            next_num = 0
        else:
            next_num = 1
        try:
            # search for the "next page" link and follow it
            next_page = 'http://news.baidu.com' + soup('a', {'href': True, 'class': 'n'})[next_num]['href']
        except:
            break
        req2 = urllib2.Request(next_page)
        req2.add_header('User-Agent',
                        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
        req = urllib2.urlopen(req2, timeout=10)
    for th in threads:
        th.setDaemon(True)
        th.start()
    for th in threads:
        th.join()
if __name__ == '__main__':
    key_word = raw_input('input key word:')
    search(key_word)