
Open-Source Search Engine abelkhan

Posted: 2016-03-03 13:08:14

I've started an open-source project: http://www.abelkhan.com/

 

So far, I've written a web crawler in Python to fetch pages, along with a simple front end.

 

Plenty of skilled people have written web crawlers before, so I mostly took the borrow-what-works approach. Thanks to Python's comprehensive libraries, the crawler turned out to be very simple to implement:

 

First, use urllib2 to fetch the HTML from a given URL:

import urllib2
import cookielib

def get_page(url):
    try:
        # Browser-like headers: some sites refuse requests that look like bots
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240',
                   'Connection': 'Keep-Alive',
                   'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
                   'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3',
                   }

        # Carry cookies across requests, as a browser would
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        req = urllib2.Request(url = url, headers = headers)
        response = opener.open(req, timeout = 5)
        the_page = response.read()
        headers = response.info()

        return the_page, headers
    except:
        import traceback
        traceback.print_exc()
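
For instance, a minimal usage sketch (the URL is just an example; the function above returns None when fetching fails):

# Fetch a page and inspect the result
result = get_page('http://www.abelkhan.com/')
if result is not None:
    page, headers = result
    print len(page)                           # size of the fetched HTML, in bytes
    print headers.getheader('Content-Type')   # server-reported content type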

One thing to note: some sites restrict crawler access, so I added the headers above to make the request look like it comes from a browser.

That workaround is only passable, but I haven't found a more robust approach.
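
A complementary measure, not in the project code but worth sketching, is to at least respect each site's robots.txt via the standard-library robotparser before fetching, so the crawler only requests pages it is allowed to:

import robotparser
import urlparse

def allowed_to_fetch(url, agent = '*'):
    # Consult the site's robots.txt before crawling a URL
    parts = urlparse.urlparse(url)
    rp = robotparser.RobotFileParser()
    rp.set_url('%s://%s/robots.txt' % (parts.scheme, parts.netloc))
    try:
        rp.read()
    except Exception:
        return True  # no readable robots.txt: assume fetching is fine
    return rp.can_fetch(agent, url)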

 

After a page is fetched, the HTML is parsed with a subclass of HTMLParser:

import HTMLParser
import chardet   # third-party encoding detector
import doclex    # the project's own tokenizer module

class htmlprocess(HTMLParser.HTMLParser):
    def __init__(self, urlinfo):
        HTMLParser.HTMLParser.__init__(self)

        self.urllist = {}
        self.sub_url = ""

        self.urlinfo = urlinfo
        self.current_url = urlinfo['url']

        # Seed the keyword list with tokens from the URL itself
        keywords = doclex.simplesplit(self.current_url)
        for key in keywords:
            if key != "com" and key != "www" and key != "cn":
                self.urlinfo['keys'][1].append(key)

        self.current_tag = ""
        self.style = ""

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag
        self.style = None
        self.sub_url = ""

        if tag == 'meta':
            # First pass: decide whether this meta tag holds keywords or a description
            for name, value in attrs:
                if name == 'name':
                    if value == 'keywords' or value == 'metaKeywords':
                        self.style = 'keywords'
                    elif value == 'description' or value == 'metaDescription':
                        self.style = 'profile'

            # Second pass: pull the content out accordingly
            for name, value in attrs:
                if name == 'content':
                    if self.style == 'keywords':
                        keywords = doclex.simplesplit(value)
                        if isinstance(keywords, list):
                            for key in keywords:
                                self.urlinfo['keys'][1].append(key)
                    elif self.style == 'profile':
                        self.urlinfo['profile'][0] = value

                    # Detect the encoding and keep a 16-character title candidate
                    encodingdate = chardet.detect(value)
                    if encodingdate['encoding']:
                        udata = unicode(value, encodingdate['encoding'])
                        tlen = 16
                        if len(udata) < 16:
                            tlen = len(udata)
                        self.urlinfo['titlegen'].append(udata[0:tlen].encode('utf-8'))
                    else:
                        self.urlinfo['titlegen'].append(value)

        if tag == 'a' or tag == 'A' or tag == 'link':
            self.sub_url = ""
            for name, value in attrs:
                if name == 'href':
                    if len(value) == 0:
                        return

                    # Turn relative links into absolute ones; judged_url and
                    # ingoreurl below are project helpers (not shown here).
                    # urlparse.urljoin would handle this more robustly.
                    if not judged_url(value):
                        if self.current_url[len(self.current_url) - 1] != '/' and value[0] != '/':
                            value = self.current_url + '/' + value
                        else:
                            value = self.current_url + value

                    # Skip javascript: pseudo-links
                    if value.find('javascript') != -1:
                        return

                    if value.find('javaScript') != -1:
                        return

                    # Site-specific blacklists for apple.com and cnblogs.com
                    if self.current_url.find("apple") != -1:
                        if value.find("http://www.apple.com/cn/mac#ac-gn-menustate") != -1:
                            return

                    if self.current_url.find("cnblogs") != -1:
                        if value.find("http://msg.cnblogs.com/send?recipient=itwriter") != -1:
                            return
                        elif value.find("http://i.cnblogs.com/EditPosts.aspx?opt=1") != -1:
                            return
                        elif value.find("http://i.cnblogs.com/EditPosts.aspx?postid=1935371") != -1:
                            return
                        elif value.find("http://msg.cnblogs.com/send?recipient=itwriter/") != -1:
                            return
                        elif value.find("http://msg.cnblogs.com/send?recipient=itwriter/GetUsername.aspx") != -1:
                            return
                        elif value.find("/EnterMyBlog.aspx?NewArticle=1") != -1:
                            return
                        elif value.find("GetUsername") != -1:
                            return
                        elif value.find("GetMyPassword") != -1:
                            return
                        elif value.find("http://i.cnblogs.com/EditPosts.aspx?postid=") != -1:
                            return
                        elif value[len(value) - 1] == '#':
                            value = value[0:-1]

                    if self.current_url.find(value) != -1:
                        return

                    # Strip a trailing '#' fragment marker
                    if value[len(value) - 1] == '#':
                        value = value[0:-1]

                    # Queue the new URL with empty keyword/title/profile slots
                    if value != self.current_url and len(value) < 64 and not ingoreurl(value):
                        self.urllist[value] = {'url': value, 'keys': {1: [], 2: [], 3: []}, 'title': '', 'titlegen': [], 'profile': {0: '', 1: '', 2: []}}
                        self.sub_url = value
                        print value

    def handle_data(self, data):
        if self.current_tag == 'title':
            try:
                # The <title> text becomes the page title and second-tier keywords
                data = doclex.delspace(data)
                keyword_list = doclex.lex(data)
                if isinstance(keyword_list, list) and len(keyword_list) > 0:
                    for key in keyword_list:
                        self.urlinfo['keys'][2].append(key)
                if len(data) > 0:
                    self.urlinfo['title'] = data
            except:
                import traceback
                traceback.print_exc()

        elif self.current_tag == 'a':
            try:
                if self.sub_url != "":
                    # Anchor text becomes second-tier keywords for the linked URL
                    keyword_list = doclex.simplesplit(data)
                    if isinstance(keyword_list, list) and len(keyword_list) > 0:
                        for key in keyword_list:
                            if key in self.urllist[self.sub_url]['keys'][3]:
                                self.urllist[self.sub_url]['keys'][3].remove(key)
                            if key not in self.urllist[self.sub_url]['keys'][1] and key not in self.urllist[self.sub_url]['keys'][2]:
                                self.urllist[self.sub_url]['keys'][2].append(key)

                    encodingdate = chardet.detect(data)
                    if encodingdate['encoding']:
                        udata = unicode(data, encodingdate['encoding'])
                        tlen = 16
                        if len(udata) < 16:
                            tlen = len(udata)
                        self.urllist[self.sub_url]['titlegen'].append(udata[0:tlen].encode('utf-8'))
                        if len(udata) > 16:
                            self.urllist[self.sub_url]['profile'][1] = udata[0:32].encode('utf-8')

            except:
                import traceback
                traceback.print_exc()
        else:
            try:
                # Any other text node contributes title/profile candidates
                # and third-tier keywords for the current page
                if not doclex.invialddata(data):
                    data = doclex.delspace(data)

                    encodingdate = chardet.detect(data)
                    udata = unicode(data, encodingdate['encoding'])
                    tlen = 16
                    if len(udata) < 16:
                        tlen = len(udata)
                    self.urlinfo['titlegen'].append(udata[0:tlen].encode('utf-8'))

                    if len(udata) > 32:
                        self.urlinfo['profile'][2].append((udata[0:32] + u"...").encode('utf-8'))

                    keys1 = doclex.lex(data)
                    for key in keys1:
                        self.urlinfo['keys'][3].append(key)

            except:
                import traceback
                traceback.print_exc()

The gist: HTMLParser's usage is covered in the Python documentation. It predefines a set of virtual hook methods, handle_starttag, handle_data, and handle_endtag; by overriding these three, you control how each tag in the HTML is processed, which is enough to fully parse a fetched page.
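
As an illustration, here is a minimal, self-contained sketch of that mechanism (the linkcollector class and the sample HTML are made up for this example):

import HTMLParser

class linkcollector(HTMLParser.HTMLParser):
    """Collects href values and prints anchor text, nothing more."""
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.links = []
        self.current_tag = ""

    def handle_starttag(self, tag, attrs):
        # Called for every opening tag, with attributes as (name, value) pairs
        self.current_tag = tag
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.links.append(value)

    def handle_data(self, data):
        # Called for the text between tags
        if self.current_tag == 'a':
            print 'anchor text:', data

    def handle_endtag(self, tag):
        # Called for every closing tag
        self.current_tag = ""

parser = linkcollector()
parser.feed('<html><body><a href="http://www.abelkhan.com/">abelkhan</a></body></html>')
print parser.links    # -> ['http://www.abelkhan.com/']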

Judging from the search results so far, the quality still leaves much to be desired; everyone is welcome to take part and offer suggestions.

 

The code is hosted at https://github.com/abelkhan, and contributions are welcome.


Original post: http://www.cnblogs.com/qianqians/p/5237869.html
